acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414) hide show
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -16,6 +16,7 @@ from sqlalchemy.engine.reflection import Inspector
16
16
  from sqlalchemy.types import TypeEngine
17
17
  from sqlalchemy_bigquery import STRUCT
18
18
 
19
+ from datahub.configuration.common import HiddenFromDocs
19
20
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
20
21
  from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey
21
22
  from datahub.ingestion.api.decorators import (
@@ -29,8 +30,14 @@ from datahub.ingestion.api.decorators import (
29
30
  from datahub.ingestion.api.source import StructuredLogLevel
30
31
  from datahub.ingestion.api.workunit import MetadataWorkUnit
31
32
  from datahub.ingestion.source.aws.s3_util import make_s3_urn
32
- from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
33
+ from datahub.ingestion.source.common.subtypes import (
34
+ DatasetContainerSubTypes,
35
+ SourceCapabilityModifier,
36
+ )
33
37
  from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
38
+ from datahub.ingestion.source.sql.athena_properties_extractor import (
39
+ AthenaPropertiesExtractor,
40
+ )
34
41
  from datahub.ingestion.source.sql.sql_common import (
35
42
  SQLAlchemySource,
36
43
  register_custom_type,
@@ -44,12 +51,17 @@ from datahub.ingestion.source.sql.sql_utils import (
44
51
  )
45
52
  from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
46
53
  from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
47
- from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
54
+ from datahub.metadata.schema_classes import (
55
+ ArrayTypeClass,
56
+ MapTypeClass,
57
+ RecordTypeClass,
58
+ )
48
59
  from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
49
60
  from datahub.utilities.sqlalchemy_type_converter import (
50
61
  MapType,
51
62
  get_schema_fields_for_sqlalchemy_column,
52
63
  )
64
+ from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path
53
65
 
54
66
  try:
55
67
  from typing_extensions import override
@@ -62,6 +74,11 @@ except ImportError:
62
74
 
63
75
  logger = logging.getLogger(__name__)
64
76
 
77
+ # Precompiled regex for SQL identifier validation
78
+ # Accepted identifiers may contain only ASCII letters (either case), digits, underscore, and period (for complex types)
79
+ # Note: Athena automatically converts uppercase to lowercase, but we're being strict for security
80
+ _IDENTIFIER_PATTERN = re.compile(r"^[a-zA-Z0-9_.]+$")
81
+
65
82
  assert STRUCT, "required type modules are not available"
66
83
  register_custom_type(STRUCT, RecordTypeClass)
67
84
  register_custom_type(MapType, MapTypeClass)
@@ -235,7 +252,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
235
252
 
236
253
 
237
254
  class AthenaConfig(SQLCommonConfig):
238
- scheme: str = "awsathena+rest"
255
+ scheme: HiddenFromDocs[str] = "awsathena+rest"
239
256
  username: Optional[str] = pydantic.Field(
240
257
  default=None,
241
258
  description="Username credential. If not specified, detected with boto3 rules. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html",
@@ -281,12 +298,22 @@ class AthenaConfig(SQLCommonConfig):
281
298
  description="Extract partitions for tables. Partition extraction needs to run a query (`select * from table$partitions`) on the table. Disable this if you don't want to grant select permission.",
282
299
  )
283
300
 
301
+ extract_partitions_using_create_statements: bool = pydantic.Field(
302
+ default=False,
303
+ description="Extract partitions using the `SHOW CREATE TABLE` statement instead of querying the table's partitions directly. This needs to be enabled to extract Iceberg partitions. If extraction fails it falls back to the default partition extraction. This is experimental.",
304
+ )
305
+
284
306
  _s3_staging_dir_population = pydantic_renamed_field(
285
307
  old_name="s3_staging_dir",
286
308
  new_name="query_result_location",
287
309
  print_warning=True,
288
310
  )
289
311
 
312
+ emit_schema_fieldpaths_as_v1: bool = pydantic.Field(
313
+ default=False,
314
+ description="Convert simple field paths to DataHub field path v1 format. Simple column paths are those that do not contain any nested fields.",
315
+ )
316
+
290
317
  profiling: AthenaProfilingConfig = AthenaProfilingConfig()
291
318
 
292
319
  def get_sql_alchemy_url(self):
@@ -321,8 +348,24 @@ class Partitionitem:
321
348
  @capability(
322
349
  SourceCapability.DATA_PROFILING,
323
350
  "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.",
351
+ subtype_modifier=[SourceCapabilityModifier.TABLE],
352
+ )
353
+ @capability(
354
+ SourceCapability.LINEAGE_COARSE,
355
+ "Supported for S3 tables",
356
+ subtype_modifier=[
357
+ SourceCapabilityModifier.VIEW,
358
+ SourceCapabilityModifier.TABLE,
359
+ ],
360
+ )
361
+ @capability(
362
+ SourceCapability.LINEAGE_FINE,
363
+ "Supported for S3 tables",
364
+ subtype_modifier=[
365
+ SourceCapabilityModifier.VIEW,
366
+ SourceCapabilityModifier.TABLE,
367
+ ],
324
368
  )
325
- @capability(SourceCapability.LINEAGE_COARSE, "Supported for S3 tables")
326
369
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
327
370
  class AthenaSource(SQLAlchemySource):
328
371
  """
@@ -473,11 +516,70 @@ class AthenaSource(SQLAlchemySource):
473
516
  return [schema for schema in schemas if schema == athena_config.database]
474
517
  return schemas
475
518
 
519
+ @classmethod
520
+ def _sanitize_identifier(cls, identifier: str) -> str:
521
+ """Sanitize SQL identifiers to prevent injection attacks.
522
+
523
+ Args:
524
+ identifier: The SQL identifier to sanitize
525
+
526
+ Returns:
527
+ Sanitized identifier safe for SQL queries
528
+
529
+ Raises:
530
+ ValueError: If identifier contains unsafe characters
531
+ """
532
+ if not identifier:
533
+ raise ValueError("Identifier cannot be empty")
534
+
535
+ # Allow only alphanumeric characters, underscores, and periods for identifiers
536
+ # This matches Athena's identifier naming rules
537
+ if not _IDENTIFIER_PATTERN.match(identifier):
538
+ raise ValueError(
539
+ f"Identifier '{identifier}' contains unsafe characters. Only alphanumeric characters, underscores, and periods are allowed."
540
+ )
541
+
542
+ return identifier
543
+
476
544
  @classmethod
477
545
  def _casted_partition_key(cls, key: str) -> str:
478
546
  # We need to cast the partition keys to a VARCHAR, since otherwise
479
547
  # Athena may throw an error during concatenation / comparison.
480
- return f"CAST({key} as VARCHAR)"
548
+ sanitized_key = cls._sanitize_identifier(key)
549
+ return f"CAST({sanitized_key} as VARCHAR)"
550
+
551
+ @classmethod
552
+ def _build_max_partition_query(
553
+ cls, schema: str, table: str, partitions: List[str]
554
+ ) -> str:
555
+ """Build SQL query to find the row with maximum partition values.
556
+
557
+ Args:
558
+ schema: Database schema name
559
+ table: Table name
560
+ partitions: List of partition column names
561
+
562
+ Returns:
563
+ SQL query string to find the maximum partition
564
+
565
+ Raises:
566
+ ValueError: If any identifier contains unsafe characters
567
+ """
568
+ # Sanitize all identifiers to prevent SQL injection
569
+ sanitized_schema = cls._sanitize_identifier(schema)
570
+ sanitized_table = cls._sanitize_identifier(table)
571
+ sanitized_partitions = [
572
+ cls._sanitize_identifier(partition) for partition in partitions
573
+ ]
574
+
575
+ casted_keys = [cls._casted_partition_key(key) for key in partitions]
576
+ if len(casted_keys) == 1:
577
+ part_concat = casted_keys[0]
578
+ else:
579
+ separator = "CAST('-' AS VARCHAR)"
580
+ part_concat = f"CONCAT({f', {separator}, '.join(casted_keys)})"
581
+
582
+ return f'select {",".join(sanitized_partitions)} from "{sanitized_schema}"."{sanitized_table}$partitions" where {part_concat} = (select max({part_concat}) from "{sanitized_schema}"."{sanitized_table}$partitions")'
481
583
 
482
584
  @override
483
585
  def get_partitions(
@@ -489,27 +591,37 @@ class AthenaSource(SQLAlchemySource):
489
591
  if not self.cursor:
490
592
  return None
491
593
 
492
- metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
493
- table_name=table, schema_name=schema
494
- )
594
+ if self.config.extract_partitions_using_create_statements:
595
+ try:
596
+ partitions = self._get_partitions_create_table(schema, table)
597
+ except Exception as e:
598
+ logger.warning(
599
+ f"Failed to get partitions from create table statement for {schema}.{table} because of {e}. Falling back to SQLAlchemy.",
600
+ exc_info=True,
601
+ )
602
+
603
+ # If we can't get create table statement, we fall back to SQLAlchemy
604
+ partitions = self._get_partitions_sqlalchemy(schema, table)
605
+ else:
606
+ partitions = self._get_partitions_sqlalchemy(schema, table)
495
607
 
496
- partitions = []
497
- for key in metadata.partition_keys:
498
- if key.name:
499
- partitions.append(key.name)
500
608
  if not partitions:
501
609
  return []
502
610
 
611
+ if (
612
+ not self.config.profiling.enabled
613
+ or not self.config.profiling.partition_profiling_enabled
614
+ ):
615
+ return partitions
616
+
503
617
  with self.report.report_exc(
504
618
  message="Failed to extract partition details",
505
619
  context=f"{schema}.{table}",
506
620
  level=StructuredLogLevel.WARN,
507
621
  ):
508
- # We create an artifical concatenated partition key to be able to query max partition easier
509
- part_concat = " || '-' || ".join(
510
- self._casted_partition_key(key) for key in partitions
622
+ max_partition_query = self._build_max_partition_query(
623
+ schema, table, partitions
511
624
  )
512
- max_partition_query = f'select {",".join(partitions)} from "{schema}"."{table}$partitions" where {part_concat} = (select max({part_concat}) from "{schema}"."{table}$partitions")'
513
625
  ret = self.cursor.execute(max_partition_query)
514
626
  max_partition: Dict[str, str] = {}
515
627
  if ret:
@@ -525,6 +637,56 @@ class AthenaSource(SQLAlchemySource):
525
637
 
526
638
  return partitions
527
639
 
640
def _get_partitions_create_table(self, schema: str, table: str) -> List[str]:
    """Return partition column names parsed from ``SHOW CREATE TABLE`` output.

    Args:
        schema: Schema containing the table.
        table: Table name.

    Returns:
        The names of ``partition_info.simple_columns`` reported by
        ``AthenaPropertiesExtractor.get_table_properties``, or an empty list
        when no partition info / simple columns are present.

    Raises:
        Exception: Any error from executing the statement or from parsing it
            is logged at debug level and re-raised unchanged, so the caller
            can fall back to another way of listing partitions.
    """
    assert self.cursor
    # NOTE(review): schema and table are interpolated into the statement
    # as-is (only wrapped in backticks); they do not pass through
    # _sanitize_identifier like the other queries in this class -- confirm
    # this is intentional.
    try:
        res = self.cursor.execute(f"SHOW CREATE TABLE `{schema}`.`{table}`")
    except Exception as e:
        # Athena does not support SHOW CREATE TABLE for views
        # and will throw an error. We need to handle this case
        # and caller needs to fallback to sqlalchemy's get partitions call.
        logger.debug(
            f"Failed to get table properties for {schema}.{table}: {e}",
            exc_info=True,
        )
        # Bare raise re-raises the active exception with its traceback intact.
        raise
    rows = res.fetchall()

    # Concatenate all rows into a single string with newlines
    create_table_statement = "\n".join(row[0] for row in rows)

    try:
        athena_table_info = AthenaPropertiesExtractor.get_table_properties(
            create_table_statement
        )
    except Exception as e:
        logger.debug(
            f"Failed to parse table properties for {schema}.{table}: {e} and statement: {create_table_statement}",
            exc_info=True,
        )
        raise

    partition_info = athena_table_info.partition_info
    if partition_info and partition_info.simple_columns:
        return [ci.name for ci in partition_info.simple_columns]
    return []
678
+
679
+ def _get_partitions_sqlalchemy(self, schema: str, table: str) -> List[str]:
680
+ assert self.cursor
681
+ metadata: AthenaTableMetadata = self.cursor.get_table_metadata(
682
+ table_name=table, schema_name=schema
683
+ )
684
+ partitions = []
685
+ for key in metadata.partition_keys:
686
+ if key.name:
687
+ partitions.append(key.name)
688
+ return partitions
689
+
528
690
  # Overwrite to modify the creation of schema fields
529
691
  def get_schema_fields_for_column(
530
692
  self,
@@ -551,6 +713,18 @@ class AthenaSource(SQLAlchemySource):
551
713
  ),
552
714
  )
553
715
 
716
+ # Keeping it as individual check to make it more explicit and easier to understand
717
+ if not self.config.emit_schema_fieldpaths_as_v1:
718
+ return fields
719
+
720
+ if isinstance(
721
+ fields[0].type.type, (RecordTypeClass, MapTypeClass, ArrayTypeClass)
722
+ ):
723
+ return fields
724
+ else:
725
+ fields[0].fieldPath = get_simple_field_path_from_v2_field_path(
726
+ fields[0].fieldPath
727
+ )
554
728
  return fields
555
729
 
556
730
  def generate_partition_profiler_query(
@@ -564,16 +738,34 @@ class AthenaSource(SQLAlchemySource):
564
738
  ).get(table, None)
565
739
 
566
740
  if partition and partition.max_partition:
567
- max_partition_filters = []
568
- for key, value in partition.max_partition.items():
569
- max_partition_filters.append(
570
- f"{self._casted_partition_key(key)} = '{value}'"
741
+ try:
742
+ # Sanitize identifiers to prevent SQL injection
743
+ sanitized_schema = self._sanitize_identifier(schema)
744
+ sanitized_table = self._sanitize_identifier(table)
745
+
746
+ max_partition_filters = []
747
+ for key, value in partition.max_partition.items():
748
+ # Sanitize partition key and properly escape the value
749
+ sanitized_key = self._sanitize_identifier(key)
750
+ # Escape single quotes in the value to prevent injection
751
+ escaped_value = value.replace("'", "''") if value else ""
752
+ max_partition_filters.append(
753
+ f"{self._casted_partition_key(sanitized_key)} = '{escaped_value}'"
754
+ )
755
+ max_partition = str(partition.max_partition)
756
+ return (
757
+ max_partition,
758
+ f'SELECT * FROM "{sanitized_schema}"."{sanitized_table}" WHERE {" AND ".join(max_partition_filters)}',
571
759
  )
572
- max_partition = str(partition.max_partition)
573
- return (
574
- max_partition,
575
- f'SELECT * FROM "{schema}"."{table}" WHERE {" AND ".join(max_partition_filters)}',
576
- )
760
+ except ValueError as e:
761
+ # If sanitization fails due to malicious identifiers,
762
+ # return None to disable partition profiling for this table
763
+ # rather than crashing the entire ingestion
764
+ logger.warning(
765
+ f"Failed to generate partition profiler query for {schema}.{table} due to unsafe identifiers: {e}. "
766
+ f"Partition profiling disabled for this table."
767
+ )
768
+ return None, None
577
769
  return None, None
578
770
 
579
771
  def close(self):