acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/ge_data_profiler.py +146 -33

@@ -5,6 +5,7 @@ import concurrent.futures
 import contextlib
 import dataclasses
 import functools
+import importlib.metadata
 import json
 import logging
 import re
@@ -51,6 +52,7 @@ from typing_extensions import Concatenate, ParamSpec
 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
 from datahub.ingestion.source.profiling.common import (
     Cardinality,
@@ -83,6 +85,30 @@ if TYPE_CHECKING:
     from pyathena.cursor import Cursor
 
 assert MARKUPSAFE_PATCHED
+
+# We need to ensure that acryl-great-expectations is installed
+# and great-expectations is not installed.
+try:
+    acryl_gx_version = bool(importlib.metadata.distribution("acryl-great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    acryl_gx_version = False
+
+try:
+    original_gx_version = bool(importlib.metadata.distribution("great-expectations"))
+except importlib.metadata.PackageNotFoundError:
+    original_gx_version = False
+
+if acryl_gx_version and original_gx_version:
+    raise RuntimeError(
+        "acryl-great-expectations and great-expectations cannot both be installed because their files will conflict. "
+        "You will need to (1) uninstall great-expectations and (2) re-install acryl-great-expectations. "
+        "See https://github.com/pypa/pip/issues/4625."
+    )
+elif original_gx_version:
+    raise RuntimeError(
+        "We expect acryl-great-expectations to be installed, but great-expectations is installed instead."
+    )
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 _original_get_column_median = SqlAlchemyDataset.get_column_median
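The hunk above refuses to run when both the vanilla great-expectations package and the acryl-great-expectations fork are present, since their files overlap on disk. A minimal sketch of the detection mechanism it relies on (the package name below is deliberately fake):

import importlib.metadata

# distribution() returns metadata for an installed package and raises
# PackageNotFoundError otherwise, so bool(...) works as an "is installed" probe.
try:
    installed = bool(importlib.metadata.distribution("some-package-that-is-not-installed"))
except importlib.metadata.PackageNotFoundError:
    installed = False

print(installed)  # False when the distribution is absent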
@@ -94,7 +120,6 @@ SNOWFLAKE = "snowflake"
 BIGQUERY = "bigquery"
 REDSHIFT = "redshift"
 DATABRICKS = "databricks"
-TRINO = "trino"
 
 # Type names for Databricks, to match Title Case types in sqlalchemy
 ProfilerTypeMapping.INT_TYPE_NAMES.append("Integer")
@@ -180,6 +205,25 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
             )
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
+    elif (
+        self.engine.dialect.name.lower() == GXSqlDialect.AWSATHENA
+        or self.engine.dialect.name.lower() == GXSqlDialect.TRINO
+    ):
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
+    elif self.engine.dialect.name.lower() == DATABRICKS:
+        return convert_to_json_serializable(
+            self.engine.execute(
+                sa.select(sa.func.approx_count_distinct(sa.column(column))).select_from(
+                    self._table
+                )
+            ).scalar()
+        )
     return convert_to_json_serializable(
         self.engine.execute(
             sa.select([sa.func.count(sa.func.distinct(sa.column(column)))]).select_from(
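The new branches above switch Trino/Athena and Databricks to the engines' approximate distinct-count functions instead of an exact count-distinct. A rough sketch of the difference in the generated SQL (table and column names are invented; SQLAlchemy 1.4+ positional select() is assumed):

import sqlalchemy as sa

tbl = sa.table("orders")
col = sa.column("customer_id")

# Approximate variant used on Trino/Athena (Databricks uses approx_count_distinct).
approx = sa.select(sa.func.approx_distinct(col)).select_from(tbl)
# Exact fallback that remains in place for every other dialect.
exact = sa.select(sa.func.count(sa.func.distinct(col))).select_from(tbl)

print(approx)  # roughly: SELECT approx_distinct(customer_id) FROM orders
print(exact)   # roughly: SELECT count(distinct(customer_id)) FROM orders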
@@ -263,7 +307,6 @@ def _is_single_row_query_method(query: Any) -> bool:
     "get_column_max",
     "get_column_mean",
     "get_column_stdev",
-    "get_column_nonnull_count",
     "get_column_unique_count",
 }
 CONSTANT_ROW_QUERY_METHODS = {
@@ -287,6 +330,7 @@ def _is_single_row_query_method(query: Any) -> bool:
 
 FIRST_PARTY_SINGLE_ROW_QUERY_METHODS = {
     "get_column_unique_count_dh_patch",
+    "_get_column_cardinality",
 }
 
 # We'll do this the inefficient way since the arrays are pretty small.
@@ -453,7 +497,20 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         self, column_spec: _SingleColumnSpec, column: str
     ) -> None:
         try:
-            nonnull_count = self.dataset.get_column_nonnull_count(column)
+            # Don't use Great Expectations get_column_nonnull_count because it
+            # generates this SQL:
+            #
+            #   sum(CASE WHEN (mycolumn IN (NULL) OR mycolumn IS NULL) THEN 1 ELSE 0 END)
+            #
+            # which fails for complex types (such as Databricks maps) that don't
+            # support the IN operator.
+            nonnull_count = convert_to_json_serializable(
+                self.dataset.engine.execute(
+                    sa.select(sa.func.count(sa.column(column))).select_from(
+                        self.dataset._table
+                    )
+                ).scalar()
+            )
             column_spec.nonnull_count = nonnull_count
         except Exception as e:
             logger.debug(
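As the inline comment notes, the replacement non-null count avoids the IN (NULL) predicate that Great Expectations emits. A minimal sketch of what the new query compiles to (hypothetical table and column; count(col) already skips NULLs under standard SQL semantics):

import sqlalchemy as sa

# count(payload) only counts non-NULL rows, so no CASE/IN predicate is needed,
# which keeps the statement valid for map and struct columns as well.
query = sa.select(sa.func.count(sa.column("payload"))).select_from(sa.table("events"))
print(query)  # roughly: SELECT count(payload) FROM events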
@@ -602,7 +659,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower() in [SNOWFLAKE, DATABRICKS]:
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -610,6 +667,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
@@ -698,11 +765,41 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
     def _get_dataset_column_distinct_value_frequencies(
         self, column_profile: DatasetFieldProfileClass, column: str
     ) -> None:
-        if self.config.include_field_distinct_value_frequencies:
+        if not self.config.include_field_distinct_value_frequencies:
+            return
+        try:
+            results = self.dataset.engine.execute(
+                sa.select(
+                    [
+                        sa.column(column),
+                        sa.func.count(sa.column(column)),
+                    ]
+                )
+                .select_from(self.dataset._table)
+                .where(sa.column(column).is_not(None))
+                .group_by(sa.column(column))
+            ).fetchall()
+
             column_profile.distinctValueFrequencies = [
-                ValueFrequencyClass(value=str(value), frequency=count)
-                for value, count in self.dataset.get_column_value_counts(column).items()
+                ValueFrequencyClass(value=str(value), frequency=int(count))
+                for value, count in results
             ]
+            # sort so output is deterministic. don't do it in SQL because not all column
+            # types are sortable in SQL (such as JSON data types on Athena/Trino).
+            column_profile.distinctValueFrequencies = sorted(
+                column_profile.distinctValueFrequencies, key=lambda x: x.value
+            )
+        except Exception as e:
+            logger.debug(
+                f"Caught exception while attempting to get distinct value frequencies for column {column}. {e}"
+            )
+
+            self.report.report_warning(
+                title="Profiling: Unable to Calculate Distinct Value Frequencies",
+                message="Distinct value frequencies for the column will not be accessible",
+                context=f"{self.dataset_name}.{column}",
+                exc=e,
+            )
 
     @_run_with_query_combiner
     def _get_dataset_column_histogram(
@@ -1137,26 +1234,34 @@ class DatahubGEProfiler:
             f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
         )
 
-        with PerfTimer() as timer, unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
-            get_column_unique_count_dh_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
-            _get_column_quantiles_bigquery_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
-            _get_column_quantiles_awsathena_patch,
-        ), unittest.mock.patch(
-            "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
-            _get_column_median_patch,
-        ), concurrent.futures.ThreadPoolExecutor(
-            max_workers=max_workers
-        ) as async_executor, SQLAlchemyQueryCombiner(
-            enabled=self.config.query_combiner_enabled,
-            catch_exceptions=self.config.catch_exceptions,
-            is_single_row_query_method=_is_single_row_query_method,
-            serial_execution_fallback_enabled=True,
-        ).activate() as query_combiner:
+        with (
+            PerfTimer() as timer,
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                get_column_unique_count_dh_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                _get_column_quantiles_bigquery_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                _get_column_quantiles_awsathena_patch,
+            ),
+            unittest.mock.patch(
+                "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                _get_column_median_patch,
+            ),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=max_workers
+            ) as async_executor,
+            SQLAlchemyQueryCombiner(
+                enabled=self.config.query_combiner_enabled,
+                catch_exceptions=self.config.catch_exceptions,
+                is_single_row_query_method=_is_single_row_query_method,
+                serial_execution_fallback_enabled=True,
+            ).activate() as query_combiner,
+        ):
             # Submit the profiling requests to the thread pool executor.
             async_profiles = collections.deque(
                 async_executor.submit(
@@ -1359,12 +1464,12 @@
             )
             return None
         finally:
-            if batch is not None and self.base_engine.engine.name.upper() in [
-                "TRINO",
-                "AWSATHENA",
+            if batch is not None and self.base_engine.engine.name.lower() in [
+                GXSqlDialect.TRINO,
+                GXSqlDialect.AWSATHENA,
             ]:
                 if (
-                    self.base_engine.engine.name.upper() == "TRINO"
+                    self.base_engine.engine.name.lower() == GXSqlDialect.TRINO
                     or temp_view is not None
                 ):
                     self._drop_temp_table(batch)
@@ -1413,9 +1518,17 @@
                 logger.error(
                     f"Unexpected {pretty_name} while profiling. Should have 3 parts but has {len(name_parts)} parts."
                 )
+            if platform == DATABRICKS:
+                # TODO: Review logic for BigQuery as well, probably project.dataset.table should be quoted there as well
+                quoted_name = ".".join(
+                    batch.engine.dialect.identifier_preparer.quote(part)
+                    for part in name_parts
+                )
+                batch._table = sa.text(quoted_name)
+                logger.debug(f"Setting quoted table name to be {batch._table}")
             # If we only have two parts that means the project_id is missing from the table name and we add it
             # Temp tables has 3 parts while normal tables only has 2 parts
-            if len(str(batch._table).split(".")) == 2:
+            elif len(str(batch._table).split(".")) == 2:
                 batch._table = sa.text(f"{name_parts[0]}.{str(batch._table)}")
                 logger.debug(f"Setting table name to be {batch._table}")
@@ -1559,7 +1672,7 @@ def _get_columns_to_ignore_sampling(
         name=dataset_name, platform=platform, env=env
     )
 
-    datahub_graph = get_default_graph()
+    datahub_graph = get_default_graph(ClientMode.INGESTION)
 
     dataset_tags = datahub_graph.get_tags(dataset_urn)
     if dataset_tags:
datahub/ingestion/source/ge_profiling_config.py +26 -11

@@ -1,12 +1,12 @@
 import datetime
 import logging
 import os
-from typing import Any, Dict, List, Optional
+from typing import Annotated, Any, Dict, List, Optional
 
 import pydantic
 from pydantic.fields import Field
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, SupportedSources
 from datahub.ingestion.source_config.operation_config import OperationConfig
 
 _PROFILING_FLAGS_TO_REPORT = {
@@ -120,28 +120,37 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         "number of columns to profile goes up.",
     )
 
-    profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
+    profile_if_updated_since_days: Annotated[
+        Optional[pydantic.PositiveFloat], SupportedSources(["snowflake", "bigquery"])
+    ] = Field(
         default=None,
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
     )
 
-    profile_table_size_limit: Optional[int] = Field(
+    profile_table_size_limit: Annotated[
+        Optional[int],
+        SupportedSources(["snowflake", "bigquery", "unity-catalog", "oracle"]),
+    ] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
     )
 
-    profile_table_row_limit: Optional[int] = Field(
+    profile_table_row_limit: Annotated[
+        Optional[int], SupportedSources(["snowflake", "bigquery", "oracle"])
+    ] = Field(
         default=5000000,
         description="Profile tables only if their row count is less than specified count. "
        "If set to `null`, no limit on the row count of tables to profile. Supported only in "
        "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
     )
 
-    profile_table_row_count_estimate_only: bool = Field(
+    profile_table_row_count_estimate_only: Annotated[
+        bool, SupportedSources(["postgres", "mysql"])
+    ] = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
@@ -157,29 +166,35 @@
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = Field(default=True, description="")
 
-    partition_profiling_enabled: bool = Field(
+    partition_profiling_enabled: Annotated[
+        bool, SupportedSources(["athena", "bigquery"])
+    ] = Field(
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
     )
-    partition_datetime: Optional[datetime.datetime] = Field(
+    partition_datetime: Annotated[
+        Optional[datetime.datetime], SupportedSources(["bigquery"])
+    ] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
     )
-    use_sampling: bool = Field(
+    use_sampling: Annotated[bool, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
     )
 
-    sample_size: int = Field(
+    sample_size: Annotated[int, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
     )
 
-    profile_external_tables: bool = Field(
+    profile_external_tables: Annotated[
+        bool, SupportedSources(["redshift", "snowflake"])
+    ] = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
     )
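The Annotated[...] wrappers above attach a SupportedSources(...) marker, imported from datahub.configuration.common, to each profiling flag without changing the flag's type. A minimal sketch of how such metadata can be read back at runtime; the SupportedSources dataclass below is a stand-in for the real class, so this is illustrative only:

import dataclasses
from typing import Annotated, List, Optional, get_type_hints


@dataclasses.dataclass
class SupportedSources:  # stand-in for datahub.configuration.common.SupportedSources
    sources: List[str]


class ExampleConfig:
    profile_table_row_limit: Annotated[
        Optional[int], SupportedSources(["snowflake", "bigquery", "oracle"])
    ] = 5000000


hints = get_type_hints(ExampleConfig, include_extras=True)
for name, hint in hints.items():
    # Annotated types expose their extra metadata via __metadata__.
    for meta in getattr(hint, "__metadata__", ()):
        if isinstance(meta, SupportedSources):
            print(f"{name} is only honored by: {meta.sources}")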
datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0 (new file)

@@ -0,0 +1,272 @@
+from typing import Dict, List, Optional, Tuple
+
+from datahub.emitter.mce_builder import (
+    make_chart_urn,
+    make_dashboard_urn,
+    make_data_platform_urn,
+    make_dataplatform_instance_urn,
+    make_dataset_urn_with_platform_instance,
+    make_tag_urn,
+    make_user_urn,
+)
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.source.grafana.models import Dashboard, Panel
+from datahub.ingestion.source.grafana.types import CHART_TYPE_MAPPINGS
+from datahub.metadata.schema_classes import (
+    ChangeAuditStampsClass,
+    ChartInfoClass,
+    DashboardInfoClass,
+    DataPlatformInstanceClass,
+    GlobalTagsClass,
+    OwnerClass,
+    OwnershipClass,
+    OwnershipTypeClass,
+    StatusClass,
+    TagAssociationClass,
+)
+
+
+def build_chart_mcps(
+    panel: Panel,
+    dashboard: Dashboard,
+    platform: str,
+    platform_instance: Optional[str],
+    env: str,
+    base_url: str,
+    ingest_tags: bool,
+) -> Tuple[Optional[str], str, List[MetadataChangeProposalWrapper]]:
+    """Build chart metadata change proposals"""
+    ds_urn = None
+    mcps = []
+
+    chart_urn = make_chart_urn(
+        platform,
+        f"{dashboard.uid}.{panel.id}",
+        platform_instance,
+    )
+
+    # Platform instance aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=chart_urn,
+            aspect=DataPlatformInstanceClass(
+                platform=make_data_platform_urn(platform),
+                instance=make_dataplatform_instance_urn(
+                    platform=platform,
+                    instance=platform_instance,
+                )
+                if platform_instance
+                else None,
+            ),
+        )
+    )
+
+    # Status aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=chart_urn,
+            aspect=StatusClass(removed=False),
+        )
+    )
+
+    # Get input datasets
+    input_datasets = []
+    if panel.datasource_ref:
+        ds_type = panel.datasource_ref.type or "unknown"
+        ds_uid = panel.datasource_ref.uid or "unknown"
+
+        # Add Grafana dataset
+        dataset_name = f"{ds_type}.{ds_uid}.{panel.id}"
+        ds_urn = make_dataset_urn_with_platform_instance(
+            platform=platform,
+            name=dataset_name,
+            platform_instance=platform_instance,
+            env=env,
+        )
+        input_datasets.append(ds_urn)
+
+    # Chart info aspect
+    title = panel.title or f"Panel {panel.id}"
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=chart_urn,
+            aspect=ChartInfoClass(
+                type=CHART_TYPE_MAPPINGS.get(panel.type) if panel.type else None,
+                description=panel.description,
+                title=title,
+                lastModified=ChangeAuditStampsClass(),
+                chartUrl=f"{base_url}/d/{dashboard.uid}?viewPanel={panel.id}",
+                customProperties=_build_custom_properties(panel),
+                inputs=input_datasets,
+            ),
+        )
+    )
+
+    # Tags aspect
+    if dashboard.tags and ingest_tags:
+        tags = []
+        for tag in dashboard.tags:
+            if ":" in tag:
+                key, value = tag.split(":", 1)
+                tag_urn = make_tag_urn(f"{key}.{value}")
+            else:
+                tag_urn = make_tag_urn(tag)
+            tags.append(TagAssociationClass(tag=tag_urn))
+
+        if tags:
+            mcps.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=chart_urn,
+                    aspect=GlobalTagsClass(tags=tags),
+                )
+            )
+
+    return ds_urn, chart_urn, mcps
+
+
+def build_dashboard_mcps(
+    dashboard: Dashboard,
+    platform: str,
+    platform_instance: Optional[str],
+    chart_urns: List[str],
+    base_url: str,
+    ingest_owners: bool,
+    ingest_tags: bool,
+) -> Tuple[str, List[MetadataChangeProposalWrapper]]:
+    """Build dashboard metadata change proposals"""
+    mcps = []
+    dashboard_urn = make_dashboard_urn(platform, dashboard.uid, platform_instance)
+
+    # Platform instance aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=DataPlatformInstanceClass(
+                platform=make_data_platform_urn(platform),
+                instance=make_dataplatform_instance_urn(
+                    platform=platform,
+                    instance=platform_instance,
+                )
+                if platform_instance
+                else None,
+            ),
+        )
+    )
+
+    # Dashboard info aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=DashboardInfoClass(
+                description=dashboard.description,
+                title=dashboard.title,
+                charts=chart_urns,
+                lastModified=ChangeAuditStampsClass(),
+                dashboardUrl=f"{base_url}/d/{dashboard.uid}",
+                customProperties=_build_dashboard_properties(dashboard),
+            ),
+        )
+    )
+
+    # Ownership aspect
+    if dashboard.uid and ingest_owners:
+        owner = _build_ownership(dashboard)
+        if owner:
+            mcps.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=dashboard_urn,
+                    aspect=owner,
+                )
+            )
+
+    # Tags aspect
+    if dashboard.tags and ingest_tags:
+        tags = [TagAssociationClass(tag=make_tag_urn(tag)) for tag in dashboard.tags]
+        if tags:
+            mcps.append(
+                MetadataChangeProposalWrapper(
+                    entityUrn=dashboard_urn,
+                    aspect=GlobalTagsClass(tags=tags),
+                )
+            )
+
+    # Status aspect
+    mcps.append(
+        MetadataChangeProposalWrapper(
+            entityUrn=dashboard_urn,
+            aspect=StatusClass(removed=False),
+        )
+    )
+
+    return dashboard_urn, mcps
+
+
+def _build_custom_properties(panel: Panel) -> Dict[str, str]:
+    """Build custom properties for chart"""
+    props = {}
+
+    if panel.type:
+        props["type"] = panel.type
+
+    if panel.datasource_ref:
+        props["datasourceType"] = panel.datasource_ref.type or ""
+        props["datasourceUid"] = panel.datasource_ref.uid or ""
+
+    for key in [
+        "description",
+        "format",
+        "pluginVersion",
+        "repeatDirection",
+        "maxDataPoints",
+    ]:
+        value = getattr(panel, key, None)
+        if value:
+            props[key] = str(value)
+
+    if panel.query_targets:
+        props["targetsCount"] = str(len(panel.query_targets))
+
+    return props
+
+
+def _build_dashboard_properties(dashboard: Dashboard) -> Dict[str, str]:
+    """Build custom properties for dashboard"""
+    props = {}
+
+    if dashboard.timezone:
+        props["timezone"] = dashboard.timezone
+
+    if dashboard.schema_version:
+        props["schema_version"] = dashboard.schema_version
+
+    if dashboard.version:
+        props["version"] = dashboard.version
+
+    if dashboard.refresh:
+        props["refresh"] = dashboard.refresh
+
+    return props
+
+
+def _build_ownership(dashboard: Dashboard) -> Optional[OwnershipClass]:
+    """Build ownership information"""
+    owners = []
+
+    if dashboard.uid:
+        owners.append(
+            OwnerClass(
+                owner=make_user_urn(dashboard.uid),
+                type=OwnershipTypeClass.TECHNICAL_OWNER,
+            )
+        )
+
+    if dashboard.created_by:
+        owner_id = dashboard.created_by.split("@")[0]
+        owners.append(
+            OwnerClass(
+                owner=make_user_urn(owner_id),
+                type=OwnershipTypeClass.DATAOWNER,
+            )
+        )
+
+    return OwnershipClass(owners=owners) if owners else None
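In build_chart_mcps above, dashboard tags of the form key:value are folded into a single dotted tag name before being turned into tag URNs. A small sketch of the resulting URNs (the tag values are invented; make_tag_urn simply prefixes urn:li:tag:):

from datahub.emitter.mce_builder import make_tag_urn

for tag in ["production", "team:analytics"]:
    if ":" in tag:
        key, value = tag.split(":", 1)
        print(make_tag_urn(f"{key}.{value}"))  # urn:li:tag:team.analytics
    else:
        print(make_tag_urn(tag))  # urn:li:tag:production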