acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_v2.py

@@ -32,6 +32,7 @@ from datahub.ingestion.api.source import (
 )
 from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
 from datahub.ingestion.source.snowflake.constants import (
     GENERIC_PERMISSION_ERROR_KEY,
     SnowflakeEdition,
@@ -97,7 +98,14 @@ logger: logging.Logger = logging.getLogger(__name__)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+        SourceCapabilityModifier.SCHEMA,
+    ],
+)
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(
     SourceCapability.DATA_PROFILING,
@@ -118,7 +126,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 @capability(
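The reworded capability reflects that stale-metadata removal now defaults on once stateful ingestion is configured. A hedged recipe fragment (illustrative sketch, not part of this diff; `remove_stale_metadata` is shown explicitly only for clarity):

# Recipe fragment as a Python dict (sketch; connection settings omitted).
source_config = {
    "type": "snowflake",
    "config": {
        "stateful_ingestion": {
            "enabled": True,
            # Stale-metadata removal is the default once stateful ingestion is on.
            "remove_stale_metadata": True,
        },
    },
}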
@@ -131,6 +139,7 @@ logger: logging.Logger = logging.getLogger(__name__)
     "Optionally enabled via `classification.enabled`",
     supported=True,
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class SnowflakeV2Source(
     SnowflakeCommonMixin,
     StatefulIngestionSourceBase,
@@ -311,6 +320,7 @@ class SnowflakeV2Source(
                 SourceCapability.PLATFORM_INSTANCE,
                 SourceCapability.DOMAINS,
                 SourceCapability.DELETION_DETECTION,
+                SourceCapability.TEST_CONNECTION,
             )
         ]

@@ -575,6 +585,7 @@ class SnowflakeV2Source(

         queries_extractor = SnowflakeQueriesExtractor(
             connection=self.connection,
+            # TODO: this should be its own section in main recipe
             config=SnowflakeQueriesExtractorConfig(
                 window=BaseTimeWindowConfig(
                     start_time=self.config.start_time,
@@ -589,6 +600,9 @@
                 include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
                 pushdown_deny_usernames=self.config.pushdown_deny_usernames,
+                query_dedup_strategy=self.config.query_dedup_strategy,
+                push_down_database_pattern_access_history=self.config.push_down_database_pattern_access_history,
+                additional_database_names_allowlist=self.config.additional_database_names_allowlist,
             ),
             structured_report=self.report,
             filters=self.filters,
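The three new extractor arguments are forwarded from the main Snowflake source config. A minimal recipe sketch wiring them up (field names taken from this diff; the connection values and the dedup-strategy value are illustrative placeholders, not documented defaults):

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical sketch: only the three "New fields" come from this diff.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "snowflake",
            "config": {
                "account_id": "my_account",       # placeholder
                "username": "datahub_user",       # placeholder
                "password": "${SNOWFLAKE_PASS}",  # placeholder
                # New fields surfaced in this release:
                "query_dedup_strategy": "STANDARD",  # illustrative value
                "push_down_database_pattern_access_history": True,
                "additional_database_names_allowlist": ["EXTRA_DB"],
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()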
datahub/ingestion/source/snowflake/stored_proc_lineage.py (new file)

@@ -0,0 +1,143 @@
+import dataclasses
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Iterable, List, Optional
+
+from datahub.ingestion.api.closeable import Closeable
+from datahub.metadata.urns import CorpUserUrn
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    PreparsedQuery,
+    UrnStr,
+)
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
+from datahub.utilities.file_backed_collections import FileBackedDict
+
+
+@dataclasses.dataclass
+class StoredProcCall:
+    snowflake_root_query_id: str
+
+    # Query text will typically be something like:
+    # "CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();"
+    query_text: str
+
+    timestamp: datetime
+    user: CorpUserUrn
+    default_db: str
+    default_schema: str
+
+
+@dataclass
+class StoredProcExecutionLineage:
+    call: StoredProcCall
+
+    inputs: List[UrnStr]
+    outputs: List[UrnStr]
+
+
+@dataclass
+class StoredProcLineageReport:
+    num_stored_proc_calls: int = 0
+    num_related_queries: int = 0
+    num_related_queries_without_proc_call: int = 0
+
+    # Incremented at generation/build time.
+    num_stored_proc_lineage_entries: int = 0
+    num_stored_proc_calls_with_no_inputs: int = 0
+    num_stored_proc_calls_with_no_outputs: int = 0
+
+
+class StoredProcLineageTracker(Closeable):
+    """
+    Tracks table-level lineage for Snowflake stored procedures.
+
+    Stored procedures in Snowflake trigger multiple SQL queries during execution.
+    Snowflake assigns each stored procedure call a unique query_id and uses this as the
+    root_query_id for all subsequent queries executed within that procedure. This allows
+    us to trace which queries belong to a specific stored procedure execution and build
+    table-level lineage by aggregating inputs/outputs from all related queries.
+    """
+
+    def __init__(self, platform: str, shared_connection: Optional[Any] = None):
+        self.platform = platform
+        self.report = StoredProcLineageReport()
+
+        # { root_query_id -> StoredProcExecutionLineage }
+        self._stored_proc_execution_lineage: FileBackedDict[
+            StoredProcExecutionLineage
+        ] = FileBackedDict(shared_connection, tablename="stored_proc_lineage")
+
+    def add_stored_proc_call(self, call: StoredProcCall) -> None:
+        """Add a stored procedure call to track."""
+        self._stored_proc_execution_lineage[call.snowflake_root_query_id] = (
+            StoredProcExecutionLineage(
+                call=call,
+                # Will be populated by subsequent queries.
+                inputs=[],
+                outputs=[],
+            )
+        )
+        self.report.num_stored_proc_calls += 1
+
+    def add_related_query(self, query: PreparsedQuery) -> bool:
+        """Add a query that might be related to a stored procedure execution.
+
+        Returns True if the query was added to a stored procedure execution, False otherwise.
+        """
+        snowflake_root_query_id = (query.extra_info or {}).get(
+            "snowflake_root_query_id"
+        )
+
+        if snowflake_root_query_id:
+            if snowflake_root_query_id not in self._stored_proc_execution_lineage:
+                self.report.num_related_queries_without_proc_call += 1
+                return False
+
+            stored_proc_execution = self._stored_proc_execution_lineage.for_mutation(
+                snowflake_root_query_id
+            )
+            stored_proc_execution.inputs.extend(query.upstreams)
+            if query.downstream is not None:
+                stored_proc_execution.outputs.append(query.downstream)
+            self.report.num_related_queries += 1
+            return True
+
+        return False
+
+    def build_merged_lineage_entries(self) -> Iterable[PreparsedQuery]:
+        # For stored procedures, we can only get table-level lineage from the audit log.
+        # We represent these as PreparsedQuery objects for now. Eventually we'll want to
+        # create dataJobInputOutput lineage instead.
+
+        for stored_proc_execution in self._stored_proc_execution_lineage.values():
+            if not stored_proc_execution.inputs:
+                self.report.num_stored_proc_calls_with_no_inputs += 1
+                continue
+
+            if not stored_proc_execution.outputs:
+                self.report.num_stored_proc_calls_with_no_outputs += 1
+                # Still continue to generate lineage for cases where we have inputs but no outputs
+
+            for downstream in stored_proc_execution.outputs:
+                stored_proc_query_id = get_query_fingerprint(
+                    stored_proc_execution.call.query_text,
+                    self.platform,
+                    fast=True,
+                    secondary_id=downstream,
+                )
+
+                lineage_entry = PreparsedQuery(
+                    query_id=stored_proc_query_id,
+                    query_text=stored_proc_execution.call.query_text,
+                    upstreams=stored_proc_execution.inputs,
+                    downstream=downstream,
+                    query_count=0,
+                    user=stored_proc_execution.call.user,
+                    timestamp=stored_proc_execution.call.timestamp,
+                )
+
+                self.report.num_stored_proc_lineage_entries += 1
+                yield lineage_entry
+
+    def close(self) -> None:
+        self._stored_proc_execution_lineage.close()
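A minimal usage sketch for the new tracker (illustrative only, not part of the diff): the URNs and query ids are made up, and the `extra_info` keyword on `PreparsedQuery` is assumed from the `add_related_query` implementation above rather than from documented API.

from datetime import datetime, timezone

from datahub.ingestion.source.snowflake.stored_proc_lineage import (
    StoredProcCall,
    StoredProcLineageTracker,
)
from datahub.metadata.urns import CorpUserUrn
from datahub.sql_parsing.sql_parsing_aggregator import PreparsedQuery

tracker = StoredProcLineageTracker(platform="snowflake")

# 1. Register the CALL itself, keyed by its root query id.
tracker.add_stored_proc_call(
    StoredProcCall(
        snowflake_root_query_id="root-123",  # made-up id
        query_text="CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();",
        timestamp=datetime.now(timezone.utc),
        user=CorpUserUrn("analyst"),
        default_db="SALES_FORECASTING",
        default_schema="PUBLIC",
    )
)

# 2. A query executed inside the procedure carries the same root query id.
child = PreparsedQuery(
    query_id="child-456",
    query_text="INSERT INTO forecasts SELECT * FROM customers",
    upstreams=["urn:li:dataset:(urn:li:dataPlatform:snowflake,db.public.customers,PROD)"],
    downstream="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.public.forecasts,PROD)",
    extra_info={"snowflake_root_query_id": "root-123"},  # assumed field, per add_related_query
)
tracker.add_related_query(child)

# 3. Merged table-level lineage, one entry per downstream table.
for entry in tracker.build_merged_lineage_entries():
    print(entry.upstreams, "->", entry.downstream)
tracker.close()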
datahub/ingestion/source/sql/athena.py

@@ -29,8 +29,14 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import StructuredLogLevel
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import make_s3_urn
-from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
+from datahub.ingestion.source.sql.athena_properties_extractor import (
+    AthenaPropertiesExtractor,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     register_custom_type,
@@ -44,12 +50,17 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
-from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
+from datahub.metadata.schema_classes import (
+    ArrayTypeClass,
+    MapTypeClass,
+    RecordTypeClass,
+)
 from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
 from datahub.utilities.sqlalchemy_type_converter import (
     MapType,
     get_schema_fields_for_sqlalchemy_column,
 )
+from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path

 try:
     from typing_extensions import override
@@ -281,12 +292,22 @@ class AthenaConfig(SQLCommonConfig):
         description="Extract partitions for tables. Partition extraction needs to run a query (`select * from table$partitions`) on the table. Disable this if you don't want to grant select permission.",
     )

+    extract_partitions_using_create_statements: bool = pydantic.Field(
+        default=False,
+        description="Extract partitions using the `SHOW CREATE TABLE` statement instead of querying the table's partitions directly. This needs to be enabled to extract Iceberg partitions. If extraction fails it falls back to the default partition extraction. This is experimental.",
+    )
+
     _s3_staging_dir_population = pydantic_renamed_field(
         old_name="s3_staging_dir",
         new_name="query_result_location",
         print_warning=True,
     )

+    emit_schema_fieldpaths_as_v1: bool = pydantic.Field(
+        default=False,
+        description="Convert simple field paths to DataHub field path v1 format. Simple column paths are those that do not contain any nested fields.",
+    )
+
     profiling: AthenaProfilingConfig = AthenaProfilingConfig()

     def get_sql_alchemy_url(self):
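A recipe sketch enabling the two new Athena flags (the connection values are placeholders; only the two fields marked "New" come from this diff):

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical sketch: region, workgroup, and result location are placeholders.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "athena",
            "config": {
                "aws_region": "us-east-1",                        # placeholder
                "work_group": "primary",                          # placeholder
                "query_result_location": "s3://athena-results/",  # placeholder
                # New in this release:
                "extract_partitions_using_create_statements": True,  # experimental
                "emit_schema_fieldpaths_as_v1": True,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()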
@@ -321,8 +342,18 @@ class Partitionitem:
 @capability(
     SourceCapability.DATA_PROFILING,
     "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
+)
+@capability(
+    SourceCapability.LINEAGE_COARSE,
+    "Supported for S3 tables",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
+)
+@capability(
+    SourceCapability.LINEAGE_FINE,
+    "Supported for S3 tables",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
 )
-@capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 class AthenaSource(SQLAlchemySource):
     """
@@ -483,23 +514,38 @@
     def get_partitions(
         self, inspector: Inspector, schema: str, table: str
    ) -> Optional[List[str]]:
-        if not self.config.extract_partitions:
+        if (
+            not self.config.extract_partitions
+            and not self.config.extract_partitions_using_create_statements
+        ):
             return None

         if not self.cursor:
             return None

-        metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
-            table_name=table, schema_name=schema
-        )
+        if self.config.extract_partitions_using_create_statements:
+            try:
+                partitions = self._get_partitions_create_table(schema, table)
+            except Exception as e:
+                logger.warning(
+                    f"Failed to get partitions from create table statement for {schema}.{table} because of {e}. Falling back to SQLAlchemy.",
+                    exc_info=True,
+                )
+
+                # If we can't get create table statement, we fall back to SQLAlchemy
+                partitions = self._get_partitions_sqlalchemy(schema, table)
+        else:
+            partitions = self._get_partitions_sqlalchemy(schema, table)

-        partitions = []
-        for key in metadata.partition_keys:
-            if key.name:
-                partitions.append(key.name)
         if not partitions:
             return []

+        if (
+            not self.config.profiling.enabled
+            or not self.config.profiling.partition_profiling_enabled
+        ):
+            return partitions
+
         with self.report.report_exc(
             message="Failed to extract partition details",
             context=f"{schema}.{table}",
@@ -525,6 +571,56 @@

         return partitions

+    def _get_partitions_create_table(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        try:
+            res = self.cursor.execute(f"SHOW CREATE TABLE `{schema}`.`{table}`")
+        except Exception as e:
+            # Athena does not support SHOW CREATE TABLE for views
+            # and will throw an error. We need to handle this case
+            # and caller needs to fallback to sqlalchemy's get partitions call.
+            logger.debug(
+                f"Failed to get table properties for {schema}.{table}: {e}",
+                exc_info=True,
+            )
+            raise e
+        rows = res.fetchall()
+
+        # Concatenate all rows into a single string with newlines
+        create_table_statement = "\n".join(row[0] for row in rows)
+
+        try:
+            athena_table_info = AthenaPropertiesExtractor.get_table_properties(
+                create_table_statement
+            )
+        except Exception as e:
+            logger.debug(
+                f"Failed to parse table properties for {schema}.{table}: {e} and statement: {create_table_statement}",
+                exc_info=True,
+            )
+            raise e
+
+        partitions = []
+        if (
+            athena_table_info.partition_info
+            and athena_table_info.partition_info.simple_columns
+        ):
+            partitions = [
+                ci.name for ci in athena_table_info.partition_info.simple_columns
+            ]
+        return partitions
+
+    def _get_partitions_sqlalchemy(self, schema: str, table: str) -> List[str]:
+        assert self.cursor
+        metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
+            table_name=table, schema_name=schema
+        )
+        partitions = []
+        for key in metadata.partition_keys:
+            if key.name:
+                partitions.append(key.name)
+        return partitions
+
     # Overwrite to modify the creation of schema fields
     def get_schema_fields_for_column(
         self,
@@ -551,6 +647,18 @@
             ),
         )

+        # Keeping it as individual check to make it more explicit and easier to understand
+        if not self.config.emit_schema_fieldpaths_as_v1:
+            return fields
+
+        if isinstance(
+            fields[0].type.type, (RecordTypeClass, MapTypeClass, ArrayTypeClass)
+        ):
+            return fields
+        else:
+            fields[0].fieldPath = get_simple_field_path_from_v2_field_path(
+                fields[0].fieldPath
+            )
         return fields

     def generate_partition_profiler_query(
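For context, `get_simple_field_path_from_v2_field_path` (imported in the hunk above) collapses a v2 field path down to the bare column name for flat columns, which is what `emit_schema_fieldpaths_as_v1` relies on. A rough sketch (the exact v2 annotations vary with the column type):

from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path

# Hypothetical v2 path for a flat string column:
v2_path = "[version=2.0].[type=string].customer_id"
print(get_simple_field_path_from_v2_field_path(v2_path))  # -> customer_id

# Nested types (records, maps, arrays) are intentionally left in v2 form by the
# code above, since a flat v1 path cannot represent their structure.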