acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -32,6 +32,7 @@ from datahub.ingestion.api.source import (
 )
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
     SnowflakeEdition,
@@ -97,7 +98,14 @@ logger: logging.Logger = logging.getLogger(__name__)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -118,7 +126,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 @capability(
@@ -131,6 +139,7 @@ logger: logging.Logger = logging.getLogger(__name__)
     "Optionally enabled via `classification.enabled`",
     supported=True,
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class SnowflakeV2Source(
     SnowflakeCommonMixin,
     StatefulIngestionSourceBase,
@@ -311,6 +320,7 @@ class SnowflakeV2Source(
                 SourceCapability.PLATFORM_INSTANCE,
                 SourceCapability.DOMAINS,
                 SourceCapability.DELETION_DETECTION,
+                SourceCapability.TEST_CONNECTION,
             )
         ]

@@ -575,6 +585,7 @@ class SnowflakeV2Source(

         queries_extractor = SnowflakeQueriesExtractor(
             connection=self.connection,
+            # TODO: this should be its own section in main recipe
             config=SnowflakeQueriesExtractorConfig(
                 window=BaseTimeWindowConfig(
                     start_time=self.config.start_time,
@@ -589,6 +600,9 @@ class SnowflakeV2Source(
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
                 pushdown_deny_usernames=self.config.pushdown_deny_usernames,
+                query_dedup_strategy=self.config.query_dedup_strategy,
+                push_down_database_pattern_access_history=self.config.push_down_database_pattern_access_history,
+                additional_database_names_allowlist=self.config.additional_database_names_allowlist,
             ),
             structured_report=self.report,
             filters=self.filters,
datahub/ingestion/source/snowflake/stored_proc_lineage.py (new file)

@@ -0,0 +1,143 @@
+import dataclasses
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Iterable, List, Optional
+
+from datahub.ingestion.api.closeable import Closeable
+from datahub.metadata.urns import CorpUserUrn
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    PreparsedQuery,
+    UrnStr,
+)
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
+from datahub.utilities.file_backed_collections import FileBackedDict
+
+
+@dataclasses.dataclass
+class StoredProcCall:
+    snowflake_root_query_id: str
+
+    # Query text will typically be something like:
+    # "CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();"
+    query_text: str
+
+    timestamp: datetime
+    user: CorpUserUrn
+    default_db: str
+    default_schema: str
+
+
+@dataclass
+class StoredProcExecutionLineage:
+    call: StoredProcCall
+
+    inputs: List[UrnStr]
+    outputs: List[UrnStr]
+
+
+@dataclass
+class StoredProcLineageReport:
+    num_stored_proc_calls: int = 0
+    num_related_queries: int = 0
+    num_related_queries_without_proc_call: int = 0
+
+    # Incremented at generation/build time.
+    num_stored_proc_lineage_entries: int = 0
+    num_stored_proc_calls_with_no_inputs: int = 0
+    num_stored_proc_calls_with_no_outputs: int = 0
+
+
+class StoredProcLineageTracker(Closeable):
+    """
+    Tracks table-level lineage for Snowflake stored procedures.
+
+    Stored procedures in Snowflake trigger multiple SQL queries during execution.
+    Snowflake assigns each stored procedure call a unique query_id and uses this as the
+    root_query_id for all subsequent queries executed within that procedure. This allows
+    us to trace which queries belong to a specific stored procedure execution and build
+    table-level lineage by aggregating inputs/outputs from all related queries.
+    """
+
+    def __init__(self, platform: str, shared_connection: Optional[Any] = None):
+        self.platform = platform
+        self.report = StoredProcLineageReport()
+
+        # { root_query_id -> StoredProcExecutionLineage }
+        self._stored_proc_execution_lineage: FileBackedDict[
+            StoredProcExecutionLineage
+        ] = FileBackedDict(shared_connection, tablename="stored_proc_lineage")
+
+    def add_stored_proc_call(self, call: StoredProcCall) -> None:
+        """Add a stored procedure call to track."""
+        self._stored_proc_execution_lineage[call.snowflake_root_query_id] = (
+            StoredProcExecutionLineage(
+                call=call,
+                # Will be populated by subsequent queries.
+                inputs=[],
+                outputs=[],
+            )
+        )
+        self.report.num_stored_proc_calls += 1
+
+    def add_related_query(self, query: PreparsedQuery) -> bool:
+        """Add a query that might be related to a stored procedure execution.
+
+        Returns True if the query was added to a stored procedure execution, False otherwise.
+        """
+        snowflake_root_query_id = (query.extra_info or {}).get(
+            "snowflake_root_query_id"
+        )
+
+        if snowflake_root_query_id:
+            if snowflake_root_query_id not in self._stored_proc_execution_lineage:
+                self.report.num_related_queries_without_proc_call += 1
+                return False
+
+            stored_proc_execution = self._stored_proc_execution_lineage.for_mutation(
+                snowflake_root_query_id
+            )
+            stored_proc_execution.inputs.extend(query.upstreams)
+            if query.downstream is not None:
+                stored_proc_execution.outputs.append(query.downstream)
+            self.report.num_related_queries += 1
+            return True
+
+        return False
+
+    def build_merged_lineage_entries(self) -> Iterable[PreparsedQuery]:
+        # For stored procedures, we can only get table-level lineage from the audit log.
+        # We represent these as PreparsedQuery objects for now. Eventually we'll want to
+        # create dataJobInputOutput lineage instead.
+
+        for stored_proc_execution in self._stored_proc_execution_lineage.values():
+            if not stored_proc_execution.inputs:
+                self.report.num_stored_proc_calls_with_no_inputs += 1
+                continue
+
+            if not stored_proc_execution.outputs:
+                self.report.num_stored_proc_calls_with_no_outputs += 1
+                # Still continue to generate lineage for cases where we have inputs but no outputs
+
+            for downstream in stored_proc_execution.outputs:
+                stored_proc_query_id = get_query_fingerprint(
+                    stored_proc_execution.call.query_text,
+                    self.platform,
+                    fast=True,
+                    secondary_id=downstream,
+                )
+
+                lineage_entry = PreparsedQuery(
+                    query_id=stored_proc_query_id,
+                    query_text=stored_proc_execution.call.query_text,
+                    upstreams=stored_proc_execution.inputs,
+                    downstream=downstream,
+                    query_count=0,
+                    user=stored_proc_execution.call.user,
+                    timestamp=stored_proc_execution.call.timestamp,
+                )
+
+                self.report.num_stored_proc_lineage_entries += 1
+                yield lineage_entry
+
+    def close(self) -> None:
+        self._stored_proc_execution_lineage.close()
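To make the flow of the new tracker easier to follow, here is a minimal, hypothetical usage sketch assembled only from the API visible in the hunk above. The identifiers, URNs, and the abbreviated PreparsedQuery construction are illustrative assumptions rather than code from this release; PreparsedQuery fields not shown in this diff are left at their defaults.

from datetime import datetime, timezone

from datahub.ingestion.source.snowflake.stored_proc_lineage import (
    StoredProcCall,
    StoredProcLineageTracker,
)
from datahub.metadata.urns import CorpUserUrn
from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery

# A stored procedure call observed in the audit log (all values are made up).
call = StoredProcCall(
    snowflake_root_query_id="01b0-example-root-id",
    query_text="CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();",
    timestamp=datetime(2025, 1, 1, tzinfo=timezone.utc),
    user=CorpUserUrn("analyst"),
    default_db="SALES",
    default_schema="FORECASTING",
)

# A child query executed inside that procedure; the tracker matches it via the
# "snowflake_root_query_id" entry in extra_info (as read in add_related_query).
child_query = PreparsedQuery(
    query_id="example-child-query-id",
    query_text="INSERT INTO forecast_output SELECT * FROM forecast_input",
    upstreams=[
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales.forecasting.forecast_input,PROD)"
    ],
    downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,sales.forecasting.forecast_output,PROD)",
    extra_info={"snowflake_root_query_id": "01b0-example-root-id"},
)

tracker = StoredProcLineageTracker(platform="snowflake")
try:
    tracker.add_stored_proc_call(call)
    claimed = tracker.add_related_query(child_query)  # True: folded into the tracked call

    # Yields one PreparsedQuery per (procedure call, downstream table) pair,
    # carrying the aggregated upstreams of all claimed child queries.
    lineage_entries = list(tracker.build_merged_lineage_entries())
    print(claimed, len(lineage_entries))
finally:
    tracker.close()

In practice this wiring lives inside the Snowflake queries extractor; the sketch only illustrates how call tracking, query correlation, and lineage merging fit together.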
datahub/ingestion/source/sql/athena.py

@@ -29,8 +29,14 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
-from datahub.ingestion.source.common.subtypes import
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.sql.athena_properties_extractor import (
+    AthenaPropertiesExtractor,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
@@ -44,12 +50,17 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
-from datahub.metadata.schema_classes import
+from datahub.metadata.schema_classes import (
+    ArrayTypeClass,
+    MapTypeClass,
+    RecordTypeClass,
+)
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
 from datahub.utilities.sqlalchemy_type_converter import (
     MapType,
     get_schema_fields_for_sqlalchemy_column,
 )
+from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path

 try:
     from typing_extensions import override
@@ -281,12 +292,22 @@ class AthenaConfig(SQLCommonConfig):
         description="Extract partitions for tables. Partition extraction needs to run a query (`select * from table$partitions`) on the table. Disable this if you don't want to grant select permission.",
     )

+    extract_partitions_using_create_statements: bool = pydantic.Field(
+        default=False,
+        description="Extract partitions using the `SHOW CREATE TABLE` statement instead of querying the table's partitions directly. This needs to be enabled to extract Iceberg partitions. If extraction fails it falls back to the default partition extraction. This is experimental.",
+    )
+
     _s3_staging_dir_population = pydantic_renamed_field(
         old_name="s3_staging_dir",
         new_name="query_result_location",
        print_warning=True,
     )

+    emit_schema_fieldpaths_as_v1: bool = pydantic.Field(
+        default=False,
+        description="Convert simple field paths to DataHub field path v1 format. Simple column paths are those that do not contain any nested fields.",
+    )
+
     profiling: AthenaProfilingConfig = AthenaProfilingConfig()

     def get_sql_alchemy_url(self):
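For orientation, the two new AthenaConfig fields above are plain pydantic booleans, so enabling them from a source config is just a matter of setting the keys. A minimal, hypothetical config fragment follows; the connection keys and values are placeholders, not a complete or verified recipe.

# Hypothetical Athena source config fragment; only the two new flag names come
# from the hunk above, everything else is an illustrative placeholder.
athena_source_config = {
    "aws_region": "us-east-1",  # placeholder connection settings
    "query_result_location": "s3://example-bucket/athena-results/",
    # New in this release (both default to False):
    "extract_partitions_using_create_statements": True,  # SHOW CREATE TABLE based extraction (needed for Iceberg partitions)
    "emit_schema_fieldpaths_as_v1": True,  # emit simple (non-nested) column paths in v1 form
}

Per the get_partitions changes further down, the create-statement path falls back to the SQLAlchemy-based partition extraction when DDL retrieval or parsing fails.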
@@ -321,8 +342,18 @@ class Partitionitem:
 @capability(
     SourceCapability.DATA_PROFILING,
     "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
+)
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Supported for S3 tables",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Supported for S3 tables",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
 )
-@capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 class AthenaSource(SQLAlchemySource):
     """
@@ -483,23 +514,38 @@ class AthenaSource(SQLAlchemySource):
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
     ) -> Optional[List[str]]:
-        if
+        if (
+            not self.config.extract_partitions
+            and not self.config.extract_partitions_using_create_statements
+        ):
             return None

         if not self.cursor:
             return None

-
-
-
+        if self.config.extract_partitions_using_create_statements:
+            try:
+                partitions = self._get_partitions_create_table(schema, table)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to get partitions from create table statement for {schema}.{table} because of {e}. Falling back to SQLAlchemy.",
+                    exc_info=True,
+                )
+
+                # If we can't get create table statement, we fall back to SQLAlchemy
+                partitions = self._get_partitions_sqlalchemy(schema, table)
+        else:
+            partitions = self._get_partitions_sqlalchemy(schema, table)

-        partitions = []
-        for key in metadata.partition_keys:
-            if key.name:
-                partitions.append(key.name)
         if not partitions:
             return []

+        if (
+            not self.config.profiling.enabled
+            or not self.config.profiling.partition_profiling_enabled
+        ):
+            return partitions
+
         with self.report.report_exc(
             message="Failed to extract partition details",
             context=f"{schema}.{table}",
@@ -525,6 +571,56 @@ class AthenaSource(SQLAlchemySource):

         return partitions

+    def _get_partitions_create_table(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        try:
+            res = self.cursor.execute(f"SHOW CREATE TABLE `{schema}`.`{table}`")
+        except Exception as e:
+            # Athena does not support SHOW CREATE TABLE for views
+            # and will throw an error. We need to handle this case
+            # and caller needs to fallback to sqlalchemy's get partitions call.
+            logger.debug(
+                f"Failed to get table properties for {schema}.{table}: {e}",
+                exc_info=True,
+            )
+            raise e
+        rows = res.fetchall()
+
+        # Concatenate all rows into a single string with newlines
+        create_table_statement = "\n".join(row[0] for row in rows)
+
+        try:
+            athena_table_info = AthenaPropertiesExtractor.get_table_properties(
+                create_table_statement
+            )
+        except Exception as e:
+            logger.debug(
+                f"Failed to parse table properties for {schema}.{table}: {e} and statement: {create_table_statement}",
+                exc_info=True,
+            )
+            raise e
+
+        partitions = []
+        if (
+            athena_table_info.partition_info
+            and athena_table_info.partition_info.simple_columns
+        ):
+            partitions = [
+                ci.name for ci in athena_table_info.partition_info.simple_columns
+            ]
+        return partitions
+
+    def _get_partitions_sqlalchemy(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
+            table_name=table, schema_name=schema
+        )
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        return partitions
+
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
         self,
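Conceptually, the new _get_partitions_create_table helper feeds the raw DDL returned by SHOW CREATE TABLE into AthenaPropertiesExtractor and keeps the names of the simple partition columns. A rough, hypothetical illustration of the kind of statement involved and the expected outcome (the exact parsing behavior is defined in the new athena_properties_extractor.py, which is not shown in this section):

# Example DDL of the sort returned by `SHOW CREATE TABLE` for a partitioned table.
# For a statement like this, _get_partitions_create_table would be expected to
# return the simple partition column names, i.e. ["year", "month"].
create_table_statement = """
CREATE EXTERNAL TABLE `sales`.`orders` (
  `order_id` string,
  `amount` double
)
PARTITIONED BY (`year` string, `month` string)
LOCATION 's3://example-bucket/orders/'
"""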
@@ -551,6 +647,18 @@ class AthenaSource(SQLAlchemySource):
             ),
         )

+        # Keeping it as individual check to make it more explicit and easier to understand
+        if not self.config.emit_schema_fieldpaths_as_v1:
+            return fields
+
+        if isinstance(
+            fields[0].type.type, (RecordTypeClass, MapTypeClass, ArrayTypeClass)
+        ):
+            return fields
+        else:
+            fields[0].fieldPath = get_simple_field_path_from_v2_field_path(
+                fields[0].fieldPath
+            )
         return fields

     def generate_partition_profiler_query(
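As a quick illustration of what the v1 downgrade in this hunk does for a simple (non-nested) column, the helper strips the bracketed version and type tokens from a v2 field path. A small sketch, assuming the usual DataHub v2 field-path conventions:

from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path

# For a simple column, the v2 field path carries version/type annotations that the
# v1 form drops; the call below should print the bare column name "customer_id".
v2_path = "[version=2.0].[type=string].customer_id"
print(get_simple_field_path_from_v2_field_path(v2_path))

Nested types (records, maps, arrays) are deliberately left in v2 form by the hunk above, since their paths are not representable as a simple dotted name.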