acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503) hide show
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,197 @@
1
+ import logging
2
+ from typing import TYPE_CHECKING, List, Optional
3
+
4
+ if TYPE_CHECKING:
5
+ from datahub.ingestion.source.unity.platform_resource_repository import (
6
+ UnityCatalogPlatformResourceRepository,
7
+ )
8
+
9
+ from pydantic import BaseModel
10
+
11
+ from datahub.api.entities.external.external_entities import (
12
+ ExternalEntity,
13
+ ExternalEntityId,
14
+ LinkedResourceSet,
15
+ )
16
+ from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
17
+ from datahub.api.entities.platformresource.platform_resource import (
18
+ PlatformResource,
19
+ PlatformResourceKey,
20
+ )
21
+ from datahub.ingestion.graph.client import DataHubGraph
22
+ from datahub.metadata.urns import TagUrn
23
+ from datahub.utilities.urns.urn import Urn
24
+
25
+
26
+ class UnityCatalogTagSyncContext(BaseModel):
27
+ # Context handed to the tag-ID builders; not empty: it carries the optional platform instance that from_datahub_tag copies onto new IDs.
28
+ platform_instance: Optional[str] = None
29
+
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
class UnityCatalogTagPlatformResourceId(ExternalEntityId):
    """
    A Unity Catalog tag platform resource ID.

    Holds the tag key, an optional tag value and an optional platform
    instance, plus two boolean state flags.
    """

    tag_key: str
    tag_value: Optional[str] = None
    platform_instance: Optional[str] = None
    # from_datahub_urn sets this to True when the repository returns a
    # resource for the generated key; every other builder here passes it
    # through or leaves it False.
    exists_in_unity_catalog: bool = False
    persisted: bool = False

    # this is a hack to make sure the property is a string and not private pydantic field
    @staticmethod
    def _RESOURCE_TYPE() -> str:
        return "UnityCatalogTagPlatformResource"

    def to_platform_resource_key(self) -> PlatformResourceKey:
        """Build the PlatformResourceKey for this tag ID."""
        # When tag_value is None the f-string renders it as the text "None",
        # so a key-only tag gets the primary key "<tag_key>:None".
        primary_key = f"{self.tag_key}:{self.tag_value}"
        resource_type = UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()
        return PlatformResourceKey(
            platform="databricks",
            resource_type=resource_type,
            primary_key=primary_key,
            platform_instance=self.platform_instance,
        )

    @classmethod
    def get_or_create_from_tag(
        cls,
        tag: UnityCatalogTag,
        platform_resource_repository: "UnityCatalogPlatformResourceRepository",
        exists_in_unity_catalog: bool = False,
    ) -> "UnityCatalogTagPlatformResourceId":
        """
        Return the ID for a UnityCatalogTag.

        The repository is searched by the tag's DataHub tag URN first; a hit
        is returned as-is. Otherwise a new ID is built from the tag's raw key
        and value text, with ``persisted=False`` and the caller-supplied
        ``exists_in_unity_catalog`` flag.
        """
        tag_urn = tag.to_datahub_tag_urn().urn()
        found = platform_resource_repository.search_entity_by_urn(tag_urn)
        if found:
            logger.debug(
                f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.raw_text}: {found}"
            )
            return found

        raw_value = None if tag.value is None else tag.value.raw_text
        return UnityCatalogTagPlatformResourceId(
            tag_key=tag.key.raw_text,
            tag_value=raw_value,
            platform_instance=platform_resource_repository.platform_instance,
            exists_in_unity_catalog=exists_in_unity_catalog,
            persisted=False,
        )

    @classmethod
    def from_datahub_urn(
        cls,
        urn: str,
        tag_sync_context: UnityCatalogTagSyncContext,
        platform_resource_repository: "UnityCatalogPlatformResourceRepository",
        graph: DataHubGraph,
    ) -> "UnityCatalogTagPlatformResourceId":
        """
        Return the ID for a DataHub URN.

        Resolution order: an entity the repository already has for the URN,
        then an ID generated from the URN. If the repository returns a
        resource for the generated ID's key, a copy of that ID with
        ``exists_in_unity_catalog=True`` is returned instead.

        Raises:
            ValueError: if no ID could be generated, or (from
                ``generate_tag_id``) if the URN is not a tag URN.
        """
        found = platform_resource_repository.search_entity_by_urn(urn)
        if found:
            return found

        candidate = cls.generate_tag_id(graph, tag_sync_context, urn)
        if not candidate:
            raise ValueError(
                f"Unable to create Unity Catalog tag ID from DataHub URN: {urn}"
            )

        stored_resource = platform_resource_repository.get(
            candidate.to_platform_resource_key()
        )
        if not stored_resource:
            return candidate

        # Build a fresh ID carrying the updated flag rather than mutating
        # the generated one.
        return UnityCatalogTagPlatformResourceId(
            tag_key=candidate.tag_key,
            tag_value=candidate.tag_value,
            platform_instance=candidate.platform_instance,
            exists_in_unity_catalog=True,  # This tag exists in Unity Catalog
            persisted=candidate.persisted,
        )

    @classmethod
    def generate_tag_id(
        cls, graph: DataHubGraph, tag_sync_context: UnityCatalogTagSyncContext, urn: str
    ) -> "UnityCatalogTagPlatformResourceId":
        """
        Build an ID from a URN string; only ``tag`` URNs are supported.

        ``graph`` is accepted but not used by this implementation.

        Raises:
            ValueError: if the URN's entity type is anything other than "tag".
        """
        entity_type = Urn.from_string(urn).entity_type
        if entity_type != "tag":
            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")

        return UnityCatalogTagPlatformResourceId.from_datahub_tag(
            TagUrn.from_string(urn), tag_sync_context
        )

    @classmethod
    def from_datahub_tag(
        cls, tag_urn: TagUrn, tag_sync_context: UnityCatalogTagSyncContext
    ) -> "UnityCatalogTagPlatformResourceId":
        """
        Build an ID from a DataHub tag URN.

        The URN is converted with ``UnityCatalogTag.from_urn``; key and value
        are taken via ``str()`` and the platform instance comes from the sync
        context. The result always has ``exists_in_unity_catalog=False``.
        """
        uc_tag = UnityCatalogTag.from_urn(tag_urn)
        value = uc_tag.value

        # NOTE(review): this path uses str(...) while get_or_create_from_tag
        # uses .raw_text -- confirm both yield the same key/value text.
        return UnityCatalogTagPlatformResourceId(
            tag_key=str(uc_tag.key),
            tag_value=None if value is None else str(value),
            platform_instance=tag_sync_context.platform_instance,
            exists_in_unity_catalog=False,
        )
147
+
148
+
149
class UnityCatalogTagPlatformResource(ExternalEntity):
    """External entity for a Unity Catalog tag and the DataHub URNs linked to it."""

    datahub_urns: LinkedResourceSet
    managed_by_datahub: bool
    id: UnityCatalogTagPlatformResourceId
    allowed_values: Optional[List[str]] = None

    def get_id(self) -> ExternalEntityId:
        """Return this entity's ID."""
        return self.id

    def is_managed_by_datahub(self) -> bool:
        """Return the ``managed_by_datahub`` flag."""
        return self.managed_by_datahub

    def datahub_linked_resources(self) -> LinkedResourceSet:
        """Return the set of linked DataHub URNs."""
        return self.datahub_urns

    def as_platform_resource(self) -> PlatformResource:
        """
        Convert this entity into a PlatformResource.

        The resource is keyed by the ID's platform resource key, uses the
        linked DataHub URNs as secondary keys, and stores this object as its
        value.
        """
        linked_urns = list(self.datahub_urns.urns)
        return PlatformResource.create(
            key=self.id.to_platform_resource_key(),
            secondary_keys=linked_urns,
            value=self,
        )

    @classmethod
    def create_default(
        cls,
        entity_id: ExternalEntityId,
        managed_by_datahub: bool,
    ) -> "UnityCatalogTagPlatformResource":
        """
        Create a default Unity Catalog tag entity when none found in DataHub.

        The returned entity has no linked URNs, no allowed values, and an ID
        copied from ``entity_id`` with both state flags reset to False.

        Raises:
            AssertionError: if ``entity_id`` is not a
                UnityCatalogTagPlatformResourceId (assertions enabled).
        """
        # Narrow the generic ExternalEntityId to the concrete ID type.
        assert isinstance(entity_id, UnityCatalogTagPlatformResourceId), (
            f"Expected UnityCatalogTagPlatformResourceId, got {type(entity_id)}"
        )

        # Copy the identifying fields into a new ID instead of mutating the
        # caller's object; a default entity starts with both flags cleared.
        fresh_id = UnityCatalogTagPlatformResourceId(
            tag_key=entity_id.tag_key,
            tag_value=entity_id.tag_value,
            platform_instance=entity_id.platform_instance,
            exists_in_unity_catalog=False,
            persisted=False,
        )
        no_links = LinkedResourceSet(urns=[])

        return cls(
            id=fresh_id,
            datahub_urns=no_links,
            managed_by_datahub=managed_by_datahub,
            allowed_values=None,
        )
@@ -11,7 +11,10 @@ from databricks.sdk.service.sql import QueryStatementType
11
11
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
12
12
  from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics
13
13
  from datahub.ingestion.api.workunit import MetadataWorkUnit
14
- from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig
14
+ from datahub.ingestion.source.unity.config import (
15
+ UnityCatalogSourceConfig,
16
+ UsageDataSource,
17
+ )
15
18
  from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
16
19
  from datahub.ingestion.source.unity.proxy_types import (
17
20
  OPERATION_STATEMENT_TYPES,
@@ -164,11 +167,50 @@ class UnityCatalogUsageExtractor:
164
167
  aspect=operation_aspect,
165
168
  ).as_workunit()
166
169
 
170
def _validate_usage_data_source_config(self) -> None:
    """
    Check the usage data source setting before any query history is fetched.

    Only the SYSTEM_TABLES mode is constrained: it requires the proxy to
    have a warehouse_id. Every other mode passes.

    Raises:
        ValueError: if usage_data_source is SYSTEM_TABLES and the proxy's
            warehouse_id is falsy.
    """
    if self.config.usage_data_source != UsageDataSource.SYSTEM_TABLES:
        return
    if self.proxy.warehouse_id:
        return

    raise ValueError(
        "usage_data_source is set to SYSTEM_TABLES but warehouse_id is not configured. "
        "Either set warehouse_id or use AUTO/API mode."
    )
182
+
167
183
  def _get_queries(self) -> Iterable[Query]:
168
184
  try:
169
- yield from self.proxy.query_history(
170
- self.config.start_time, self.config.end_time
171
- )
185
+ self._validate_usage_data_source_config()
186
+ usage_data_source = self.config.usage_data_source
187
+
188
+ if usage_data_source == UsageDataSource.AUTO:
189
+ if self.proxy.warehouse_id:
190
+ logger.info(
191
+ "Using system tables for usage query history (AUTO mode)"
192
+ )
193
+ yield from self.proxy.get_query_history_via_system_tables(
194
+ self.config.start_time, self.config.end_time
195
+ )
196
+ else:
197
+ logger.info(
198
+ "Using API for usage query history (AUTO mode, no warehouse)"
199
+ )
200
+ yield from self.proxy.query_history(
201
+ self.config.start_time, self.config.end_time
202
+ )
203
+ elif usage_data_source == UsageDataSource.SYSTEM_TABLES:
204
+ logger.info("Using system tables for usage query history (forced)")
205
+ yield from self.proxy.get_query_history_via_system_tables(
206
+ self.config.start_time, self.config.end_time
207
+ )
208
+ elif usage_data_source == UsageDataSource.API:
209
+ logger.info("Using API for usage query history (forced)")
210
+ yield from self.proxy.query_history(
211
+ self.config.start_time, self.config.end_time
212
+ )
213
+
172
214
  except Exception as e:
173
215
  logger.warning("Error getting queries", exc_info=True)
174
216
  self.report.report_warning("get-queries", str(e))
@@ -2,7 +2,7 @@ import collections
2
2
  import dataclasses
3
3
  import logging
4
4
  from datetime import datetime
5
- from typing import Dict, Iterable, List
5
+ from typing import Any, Dict, Iterable, List, Optional
6
6
 
7
7
  from dateutil import parser
8
8
  from pydantic.fields import Field
@@ -74,15 +74,22 @@ class ClickHouseUsageConfig(ClickHouseConfig, BaseUsageConfig, EnvConfigMixin):
74
74
  options: dict = Field(default={}, description="")
75
75
  query_log_table: str = Field(default="system.query_log", exclude=True)
76
76
 
77
- def get_sql_alchemy_url(self):
78
- return super().get_sql_alchemy_url()
77
+ def get_sql_alchemy_url(
78
+ self,
79
+ uri_opts: Optional[Dict[str, Any]] = None,
80
+ current_db: Optional[str] = None,
81
+ ) -> str:
82
+ return super().get_sql_alchemy_url(uri_opts=uri_opts, current_db=current_db)
79
83
 
80
84
 
81
85
  @platform_name("ClickHouse")
82
86
  @config_class(ClickHouseUsageConfig)
83
87
  @support_status(SupportStatus.CERTIFIED)
84
- @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
88
+ @capability(
89
+ SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
90
+ )
85
91
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
92
+ @capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
86
93
  @dataclasses.dataclass
87
94
  class ClickHouseUsageSource(Source):
88
95
  """
@@ -4,7 +4,7 @@ import json
4
4
  import logging
5
5
  from datetime import datetime
6
6
  from email.utils import parseaddr
7
- from typing import Dict, Iterable, List, Optional
7
+ from typing import Any, Dict, Iterable, List, Optional
8
8
 
9
9
  from dateutil import parser
10
10
  from pydantic.fields import Field
@@ -15,7 +15,9 @@ from sqlalchemy.engine import Engine
15
15
  import datahub.emitter.mce_builder as builder
16
16
  from datahub.configuration.time_window_config import get_time_bucket
17
17
  from datahub.ingestion.api.decorators import (
18
+ SourceCapability,
18
19
  SupportStatus,
20
+ capability,
19
21
  config_class,
20
22
  platform_name,
21
23
  support_status,
@@ -58,7 +60,7 @@ AggregatedDataset = GenericAggregatedDataset[TrinoTableRef]
58
60
 
59
61
  class TrinoConnectorInfo(BaseModel):
60
62
  partitionIds: List[str]
61
- truncated: Optional[bool]
63
+ truncated: Optional[bool] = None
62
64
 
63
65
 
64
66
  class TrinoAccessedMetadata(BaseModel):
@@ -78,7 +80,7 @@ class TrinoJoinedAccessEvent(BaseModel):
78
80
  table: Optional[str] = None
79
81
  accessed_metadata: List[TrinoAccessedMetadata]
80
82
  starttime: datetime = Field(alias="create_time")
81
- endtime: Optional[datetime] = Field(alias="end_time")
83
+ endtime: Optional[datetime] = Field(None, alias="end_time")
82
84
 
83
85
 
84
86
  class EnvBasedSourceBaseConfig:
@@ -98,8 +100,10 @@ class TrinoUsageConfig(TrinoConfig, BaseUsageConfig, EnvBasedSourceBaseConfig):
98
100
  options: dict = Field(default={}, description="")
99
101
  database: str = Field(description="The name of the catalog from getting the usage")
100
102
 
101
- def get_sql_alchemy_url(self):
102
- return super().get_sql_alchemy_url()
103
+ def get_sql_alchemy_url(
104
+ self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
105
+ ) -> str:
106
+ return super().get_sql_alchemy_url(uri_opts=uri_opts, database=database)
103
107
 
104
108
 
105
109
  @dataclasses.dataclass
@@ -110,6 +114,7 @@ class TrinoUsageReport(SourceReport):
110
114
  @platform_name("Trino")
111
115
  @config_class(TrinoUsageConfig)
112
116
  @support_status(SupportStatus.CERTIFIED)
117
+ @capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
113
118
  @dataclasses.dataclass
114
119
  class TrinoUsageSource(Source):
115
120
  """
@@ -12,15 +12,13 @@ from typing import (
12
12
  Optional,
13
13
  Tuple,
14
14
  TypeVar,
15
- Union,
16
15
  )
17
16
 
18
17
  import pydantic
19
- from deprecated import deprecated
20
18
  from pydantic.fields import Field
21
19
 
22
20
  import datahub.emitter.mce_builder as builder
23
- from datahub.configuration.common import AllowDenyPattern
21
+ from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
24
22
  from datahub.configuration.time_window_config import (
25
23
  BaseTimeWindowConfig,
26
24
  BucketDuration,
@@ -28,19 +26,13 @@ from datahub.configuration.time_window_config import (
28
26
  )
29
27
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
30
28
  from datahub.ingestion.api.workunit import MetadataWorkUnit
31
- from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetUsageStatistics
32
29
  from datahub.metadata.schema_classes import (
33
- CalendarIntervalClass,
34
30
  DatasetFieldUsageCountsClass,
35
31
  DatasetUsageStatisticsClass,
36
32
  DatasetUserUsageCountsClass,
37
33
  TimeWindowSizeClass,
38
- UsageAggregationClass,
39
- WindowDurationClass,
40
34
  )
41
35
  from datahub.utilities.sql_formatter import format_sql_query, trim_query
42
- from datahub.utilities.urns.dataset_urn import DatasetUrn
43
- from datahub.utilities.urns.urn import guess_entity_type
44
36
 
45
37
  logger = logging.getLogger(__name__)
46
38
 
@@ -202,13 +194,13 @@ class GenericAggregatedDataset(Generic[ResourceType]):
202
194
 
203
195
 
204
196
  class BaseUsageConfig(BaseTimeWindowConfig):
205
- queries_character_limit: int = Field(
197
+ queries_character_limit: HiddenFromDocs[int] = Field(
198
+ # Hidden since we don't want to encourage people to break elasticsearch.
206
199
  default=DEFAULT_QUERIES_CHARACTER_LIMIT,
207
200
  description=(
208
201
  "Total character limit for all queries in a single usage aspect."
209
202
  " Queries will be truncated to length `queries_character_limit / top_n_queries`."
210
203
  ),
211
- hidden_from_docs=True, # Don't want to encourage people to break elasticsearch
212
204
  )
213
205
 
214
206
  top_n_queries: pydantic.PositiveInt = Field(
@@ -276,6 +268,7 @@ class UsageAggregator(Generic[ResourceType]):
276
268
  user,
277
269
  query,
278
270
  fields,
271
+ user_email_pattern=self.config.user_email_pattern,
279
272
  count=count,
280
273
  )
281
274
 
@@ -295,60 +288,3 @@ class UsageAggregator(Generic[ResourceType]):
295
288
  user_urn_builder=user_urn_builder,
296
289
  queries_character_limit=self.config.queries_character_limit,
297
290
  )
298
-
299
-
300
- @deprecated
301
- def convert_usage_aggregation_class(
302
- obj: UsageAggregationClass,
303
- ) -> MetadataChangeProposalWrapper:
304
- # Legacy usage aggregation only supported dataset usage stats
305
- if guess_entity_type(obj.resource) == DatasetUrn.ENTITY_TYPE:
306
- aspect = DatasetUsageStatistics(
307
- timestampMillis=obj.bucket,
308
- eventGranularity=TimeWindowSizeClass(
309
- unit=convert_window_to_interval(obj.duration)
310
- ),
311
- uniqueUserCount=obj.metrics.uniqueUserCount,
312
- totalSqlQueries=obj.metrics.totalSqlQueries,
313
- topSqlQueries=obj.metrics.topSqlQueries,
314
- userCounts=(
315
- [
316
- DatasetUserUsageCountsClass(
317
- user=u.user, count=u.count, userEmail=u.userEmail
318
- )
319
- for u in obj.metrics.users
320
- if u.user is not None
321
- ]
322
- if obj.metrics.users
323
- else None
324
- ),
325
- fieldCounts=(
326
- [
327
- DatasetFieldUsageCountsClass(fieldPath=f.fieldName, count=f.count)
328
- for f in obj.metrics.fields
329
- ]
330
- if obj.metrics.fields
331
- else None
332
- ),
333
- )
334
- return MetadataChangeProposalWrapper(entityUrn=obj.resource, aspect=aspect)
335
- else:
336
- raise Exception(
337
- f"Skipping unsupported usage aggregation - invalid entity type: {obj}"
338
- )
339
-
340
-
341
- @deprecated
342
- def convert_window_to_interval(window: Union[str, WindowDurationClass]) -> str:
343
- if window == WindowDurationClass.YEAR:
344
- return CalendarIntervalClass.YEAR
345
- elif window == WindowDurationClass.MONTH:
346
- return CalendarIntervalClass.MONTH
347
- elif window == WindowDurationClass.WEEK:
348
- return CalendarIntervalClass.WEEK
349
- elif window == WindowDurationClass.DAY:
350
- return CalendarIntervalClass.DAY
351
- elif window == WindowDurationClass.HOUR:
352
- return CalendarIntervalClass.HOUR
353
- else:
354
- raise Exception(f"Unsupported window duration: {window}")
File without changes