acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub has been flagged as potentially problematic; see the advisory details on the package's registry page.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,202 @@
1
+ import logging
2
+ from typing import Dict, List, Optional, Tuple
3
+
4
+ from datahub.emitter.mce_builder import (
5
+ make_dataset_urn_with_platform_instance,
6
+ make_schema_field_urn,
7
+ )
8
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
9
+ from datahub.ingestion.graph.client import DataHubGraph
10
+ from datahub.ingestion.source.grafana.grafana_config import PlatformConnectionConfig
11
+ from datahub.ingestion.source.grafana.models import (
12
+ DatasourceRef,
13
+ GrafanaQueryTarget,
14
+ Panel,
15
+ )
16
+ from datahub.ingestion.source.grafana.report import GrafanaSourceReport
17
+ from datahub.metadata.schema_classes import (
18
+ DatasetLineageTypeClass,
19
+ FineGrainedLineageClass,
20
+ FineGrainedLineageDownstreamTypeClass,
21
+ FineGrainedLineageUpstreamTypeClass,
22
+ UpstreamClass,
23
+ UpstreamLineageClass,
24
+ )
25
+ from datahub.sql_parsing.sqlglot_lineage import (
26
+ SqlParsingResult,
27
+ create_lineage_sql_parsed_result,
28
+ )
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class LineageExtractor:
    """Extracts table- and column-level lineage from Grafana panels.

    For panels whose datasource UID is present in
    ``connection_to_platform_map``, raw SQL (when the panel has a
    ``rawSql`` query target) is parsed with DataHub's SQL lineage parser
    to produce fine-grained (column-level) lineage. When parsing fails,
    yields nothing, or column lineage is disabled, a basic table-level
    upstream edge is emitted instead.
    """

    def __init__(
        self,
        platform: str,
        platform_instance: Optional[str],
        env: str,
        connection_to_platform_map: Dict[str, PlatformConnectionConfig],
        report: GrafanaSourceReport,
        graph: Optional[DataHubGraph] = None,
        include_column_lineage: bool = True,
    ):
        """
        Args:
            platform: Source platform id used for the panel dataset URN.
            platform_instance: Optional platform instance for the panel URN.
            env: DataHub environment (e.g. PROD) for the panel URN.
            connection_to_platform_map: Maps Grafana datasource UIDs to the
                upstream platform connection config used for lineage.
            report: Ingestion report object (kept for reporting hooks).
            graph: DataHub graph client; required for SQL parsing with
                schema resolution. Without it, only basic lineage is emitted.
            include_column_lineage: When False, fine-grained lineage is
                skipped even if SQL parsing succeeds.
        """
        self.platform = platform
        self.platform_instance = platform_instance
        self.env = env
        self.connection_map = connection_to_platform_map
        self.graph = graph
        self.report = report
        self.include_column_lineage = include_column_lineage

    def extract_panel_lineage(
        self, panel: Panel
    ) -> Optional[MetadataChangeProposalWrapper]:
        """Extract lineage information from a panel.

        Returns a column-level lineage MCP when SQL parsing succeeds, a
        basic table-level lineage MCP when only the connection mapping is
        known, or None when the panel has no datasource or its datasource
        is not mapped to a platform.
        """
        if not panel.datasource_ref:
            return None

        ds_type, ds_uid = self._extract_datasource_info(panel.datasource_ref)
        raw_sql = self._extract_raw_sql(panel.query_targets)
        ds_urn = self._build_dataset_urn(ds_type, ds_uid, panel.id)

        # Only emit lineage for datasources explicitly mapped to a platform.
        if ds_uid in self.connection_map:
            if raw_sql:
                parsed_sql = self._parse_sql(raw_sql, self.connection_map[ds_uid])
                if parsed_sql:
                    lineage = self._create_column_lineage(ds_urn, parsed_sql)
                    if lineage:
                        return lineage

            # Fall back to basic lineage if SQL parsing fails or no column
            # lineage was created.
            return self._create_basic_lineage(
                ds_uid, self.connection_map[ds_uid], ds_urn
            )

        return None

    def _extract_datasource_info(
        self, datasource_ref: "DatasourceRef"
    ) -> Tuple[str, str]:
        """Extract datasource type and UID, defaulting both to "unknown"."""
        return datasource_ref.type or "unknown", datasource_ref.uid or "unknown"

    def _extract_raw_sql(
        self, query_targets: List["GrafanaQueryTarget"]
    ) -> Optional[str]:
        """Return the first non-empty ``rawSql`` among the panel's query
        targets, or None if no target carries raw SQL."""
        for target in query_targets:
            if target.get("rawSql"):
                return target["rawSql"]
        return None

    def _build_dataset_urn(self, ds_type: str, ds_uid: str, panel_id: str) -> str:
        """Build the dataset URN representing this panel's data.

        The dataset name is ``<datasource type>.<datasource uid>.<panel id>``.
        """
        dataset_name = f"{ds_type}.{ds_uid}.{panel_id}"
        return make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=dataset_name,
            platform_instance=self.platform_instance,
            env=self.env,
        )

    def _create_basic_lineage(
        self, ds_uid: str, platform_config: PlatformConnectionConfig, ds_urn: str
    ) -> MetadataChangeProposalWrapper:
        """Create a single table-level upstream edge from the mapped
        platform connection to the panel dataset."""
        # Qualify the upstream name with the configured database, if any.
        name = (
            f"{platform_config.database}.{ds_uid}"
            if platform_config.database
            else ds_uid
        )

        upstream_urn = make_dataset_urn_with_platform_instance(
            platform=platform_config.platform,
            name=name,
            platform_instance=platform_config.platform_instance,
            env=platform_config.env,
        )

        logger.info(f"Generated upstream URN: {upstream_urn}")

        return MetadataChangeProposalWrapper(
            entityUrn=ds_urn,
            aspect=UpstreamLineageClass(
                upstreams=[
                    UpstreamClass(
                        dataset=upstream_urn,
                        type=DatasetLineageTypeClass.TRANSFORMED,
                    )
                ]
            ),
        )

    def _parse_sql(
        self, sql: str, platform_config: PlatformConnectionConfig
    ) -> Optional[SqlParsingResult]:
        """Parse SQL query for lineage information.

        Requires ``self.graph`` for schema-aware parsing; returns None on
        any parsing failure (errors are logged, not raised).
        """
        if not self.graph:
            logger.warning("No DataHub graph specified for SQL parsing.")
            return None

        try:
            return create_lineage_sql_parsed_result(
                query=sql,
                platform=platform_config.platform,
                platform_instance=platform_config.platform_instance,
                env=platform_config.env,
                default_db=platform_config.database,
                default_schema=platform_config.database_schema,
                graph=self.graph,
            )
        except ValueError as e:
            logger.error(f"SQL parsing error for query: {sql}", exc_info=e)
        except Exception as e:
            logger.exception(f"Unexpected error during SQL parsing: {sql}", exc_info=e)

        return None

    def _create_column_lineage(
        self,
        dataset_urn: str,
        parsed_sql: SqlParsingResult,
    ) -> Optional[MetadataChangeProposalWrapper]:
        """Create column-level (fine-grained) lineage from a parsed SQL
        result.

        Returns None when column lineage is disabled or the parser found
        no column mappings, letting the caller fall back to basic lineage.
        """
        if not parsed_sql.column_lineage or not self.include_column_lineage:
            return None

        fine_grained_lineages = []
        for col_lineage in parsed_sql.column_lineage:
            fine_grained_lineages.append(
                FineGrainedLineageClass(
                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
                    downstreams=[
                        make_schema_field_urn(
                            dataset_urn, col_lineage.downstream.column
                        )
                    ],
                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
                    # Bug fix: point each upstream field at the table the
                    # column actually came from (ColumnRef.table), instead
                    # of cross-joining every upstream column with every
                    # input table, which fabricated field URNs on
                    # multi-table queries.
                    upstreams=[
                        make_schema_field_urn(col.table, col.column)
                        for col in col_lineage.upstreams
                    ],
                )
            )

        return MetadataChangeProposalWrapper(
            entityUrn=dataset_urn,
            aspect=UpstreamLineageClass(
                upstreams=[
                    UpstreamClass(
                        dataset=table,
                        type=DatasetLineageTypeClass.TRANSFORMED,
                    )
                    for table in parsed_sql.in_tables
                ],
                fineGrainedLineages=fine_grained_lineages,
            ),
        )
@@ -0,0 +1,137 @@
1
+ """Grafana data models for DataHub ingestion.
2
+
3
+ References:
4
+ - Grafana HTTP API: https://grafana.com/docs/grafana/latest/developers/http_api/
5
+ - Dashboard API: https://grafana.com/docs/grafana/latest/developers/http_api/dashboard/
6
+ - Folder API: https://grafana.com/docs/grafana/latest/developers/http_api/folder/
7
+ - Search API: https://grafana.com/docs/grafana/latest/developers/http_api/other/#search-api
8
+ - Dashboard JSON structure: https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/view-dashboard-json-model/
9
+ """
10
+
11
+ import logging
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from pydantic import BaseModel, ConfigDict, Field
15
+
16
+ from datahub.emitter.mcp_builder import ContainerKey
17
+
18
+ logger = logging.getLogger(__name__)
19
+ # Grafana-specific type definitions for better type safety
20
+ GrafanaQueryTarget = Dict[
21
+ str, Any
22
+ ] # Query targets: refId, expr/query, datasource, hide, etc.
23
+ GrafanaFieldConfig = Dict[
24
+ str, Any
25
+ ] # Field config: defaults, overrides, display settings
26
+ GrafanaTransformation = Dict[str, Any] # Transformations: id, options
27
+
28
+
29
class _GrafanaBaseModel(BaseModel):
    """Shared base for Grafana models.

    Grafana frequently sends numeric ids/versions; coerce them to strings so
    the str-typed fields on the subclasses below validate cleanly.
    """

    model_config = ConfigDict(coerce_numbers_to_str=True)
31
+
32
+
33
class DatasourceRef(_GrafanaBaseModel):
    """Reference to a Grafana datasource, as embedded in panel/query JSON.

    All fields are optional: older dashboards may carry only a name string,
    newer ones a type/uid pair.
    """

    type: Optional[str] = None  # Datasource type (prometheus, mysql, postgres, etc.)
    uid: Optional[str] = None  # Datasource unique identifier
    name: Optional[str] = None  # Datasource display name
39
+
40
+
41
class Panel(_GrafanaBaseModel):
    """Represents a Grafana dashboard panel."""

    # Panel id; Grafana sends an int, coerced to str by the base model config.
    id: str
    title: str
    description: str = ""
    # Panel visualization type (e.g. "timeseries", "table"); see CHART_TYPE_MAPPINGS.
    type: Optional[str] = None
    # Query targets - each contains refId (A,B,C...), query/expr, datasource ref, etc.
    query_targets: List[GrafanaQueryTarget] = Field(
        default_factory=list, alias="targets"
    )
    # Datasource reference - contains type, uid, name
    datasource_ref: Optional[DatasourceRef] = Field(default=None, alias="datasource")
    # Field configuration - display settings, defaults, overrides
    field_config: GrafanaFieldConfig = Field(default_factory=dict, alias="fieldConfig")
    # Data transformations - each contains id and transformation-specific options
    transformations: List[GrafanaTransformation] = Field(default_factory=list)
58
+
59
+
60
class Dashboard(_GrafanaBaseModel):
    """Represents a Grafana dashboard (the `dashboard` object of the API response)."""

    uid: str
    title: str
    description: str = ""
    version: Optional[str] = None
    panels: List[Panel]
    tags: List[str]
    timezone: Optional[str] = None
    refresh: Optional[str] = None
    schema_version: Optional[str] = Field(default=None, alias="schemaVersion")
    folder_id: Optional[str] = Field(default=None, alias="meta.folderId")
    created_by: Optional[str] = None

    @staticmethod
    def extract_panels(panels_data: List[Dict[str, Any]]) -> List[Panel]:
        """Extract panels, flattening panels nested inside "row" panels.

        Row panels themselves are layout-only and are never returned.
        """
        panels: List[Panel] = []
        for panel_data in panels_data:
            if panel_data.get("type") == "row" and "panels" in panel_data:
                # Collapsed rows carry their member panels inline; lift them out.
                panels.extend(
                    Panel.parse_obj(p)
                    for p in panel_data["panels"]
                    if p.get("type") != "row"
                )
            elif panel_data.get("type") != "row":
                panels.append(Panel.parse_obj(panel_data))
        return panels

    @classmethod
    def parse_obj(cls, data: Dict[str, Any]) -> "Dashboard":
        """Custom parsing to handle nested panel extraction and folder metadata.

        Expects the Grafana "get dashboard" API response shape, i.e. a dict
        with a `dashboard` key (and usually a sibling `meta` key).
        """
        dashboard_data = data.get("dashboard", {})
        _panel_data = dashboard_data.get("panels", [])
        panels: List[Panel] = []
        try:
            panels = cls.extract_panels(_panel_data)
        except Exception as e:
            logger.warning(
                f"Error extracting panels from dashboard for dashboard panels {_panel_data} : {e}"
            )

        # The Grafana dashboard API returns `meta` as a sibling of `dashboard`,
        # not nested inside it; check both locations for robustness.
        meta = dashboard_data.get("meta") or data.get("meta", {})
        folder_id = meta.get("folderId")

        # Supply folder_id under its alias: the model config does not enable
        # population by field name, so a plain "folder_id" key would be
        # silently ignored by pydantic and the field would stay None.
        dashboard_dict = {
            **dashboard_data,
            "panels": panels,
            "meta.folderId": folder_id,
        }
        # Drop the raw meta blob so it doesn't collide with model fields.
        dashboard_dict.pop("meta", None)

        # Handle refresh field type mismatch - convert boolean to string.
        if isinstance(dashboard_dict.get("refresh"), bool):
            dashboard_dict["refresh"] = str(dashboard_dict["refresh"])

        return super().parse_obj(dashboard_dict)
117
+
118
+
119
class Folder(_GrafanaBaseModel):
    """Represents a Grafana folder (dashboards are organized into folders)."""

    # Folder id; numeric in the API but coerced to str by the base model config.
    id: str
    title: str
    description: Optional[str] = ""
125
+
126
+
127
class FolderKey(ContainerKey):
    """Container key identifying a Grafana folder."""

    # Identifier of the Grafana folder this container represents.
    folder_id: str
131
+
132
+
133
class DashboardContainerKey(ContainerKey):
    """Container key identifying a Grafana dashboard."""

    # Identifier of the dashboard (presumably the dashboard uid — confirm with callers).
    dashboard_id: str
    folder_id: Optional[str] = None  # Reference to parent folder
@@ -0,0 +1,90 @@
1
+ from dataclasses import dataclass
2
+
3
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
4
+ StaleEntityRemovalSourceReport,
5
+ )
6
+
7
+
8
@dataclass
class GrafanaSourceReport(StaleEntityRemovalSourceReport):
    """Ingestion report for the Grafana source.

    Extends the stale-entity-removal report with Grafana-specific counters:
    per-entity scan totals, per-panel lineage and schema extraction outcomes,
    and warning tallies by category. Each ``report_*`` method bumps exactly
    one counter.
    """

    # Entity counters
    dashboards_scanned: int = 0
    charts_scanned: int = 0
    folders_scanned: int = 0
    datasets_scanned: int = 0

    # Lineage counters
    panels_with_lineage: int = 0
    panels_without_lineage: int = 0
    lineage_extraction_failures: int = 0
    sql_parsing_attempts: int = 0
    sql_parsing_successes: int = 0
    sql_parsing_failures: int = 0

    # Schema extraction counters
    panels_with_schema_fields: int = 0
    panels_without_schema_fields: int = 0

    # Warning counters
    permission_warnings: int = 0
    datasource_warnings: int = 0
    panel_parsing_warnings: int = 0

    # --- entity scan counters ---------------------------------------------

    def report_dashboard_scanned(self) -> None:
        """Count one scanned dashboard."""
        self.dashboards_scanned += 1

    def report_chart_scanned(self) -> None:
        """Count one scanned chart."""
        self.charts_scanned += 1

    def report_folder_scanned(self) -> None:
        """Count one scanned folder."""
        self.folders_scanned += 1

    def report_dataset_scanned(self) -> None:
        """Count one scanned dataset."""
        self.datasets_scanned += 1

    # --- lineage outcomes --------------------------------------------------

    def report_lineage_extracted(self) -> None:
        """Count a panel whose lineage was extracted successfully."""
        self.panels_with_lineage += 1

    def report_no_lineage(self) -> None:
        """Count a panel for which no lineage was found."""
        self.panels_without_lineage += 1

    def report_lineage_extraction_failure(self) -> None:
        """Count a panel whose lineage extraction failed."""
        self.lineage_extraction_failures += 1

    def report_sql_parsing_attempt(self) -> None:
        """Count one attempt at parsing SQL."""
        self.sql_parsing_attempts += 1

    def report_sql_parsing_success(self) -> None:
        """Count one successful SQL parse."""
        self.sql_parsing_successes += 1

    def report_sql_parsing_failure(self) -> None:
        """Count one failed SQL parse."""
        self.sql_parsing_failures += 1

    # --- schema field outcomes ---------------------------------------------

    def report_schema_fields_extracted(self) -> None:
        """Count a panel for which schema fields were extracted."""
        self.panels_with_schema_fields += 1

    def report_no_schema_fields(self) -> None:
        """Count a panel with no extractable schema fields."""
        self.panels_without_schema_fields += 1

    # --- warnings -----------------------------------------------------------

    def report_permission_warning(self) -> None:
        """Count a permission-related warning."""
        self.permission_warnings += 1

    def report_datasource_warning(self) -> None:
        """Count a datasource-related warning."""
        self.datasource_warnings += 1

    def report_panel_parsing_warning(self) -> None:
        """Count a panel parsing warning."""
        self.panel_parsing_warnings += 1
@@ -0,0 +1,16 @@
1
+ from datahub.metadata.schema_classes import (
2
+ ChartTypeClass,
3
+ )
4
+
5
# Maps Grafana panel `type` values to the closest DataHub ChartTypeClass.
# Panel types with no direct DataHub equivalent (stat, gauge, heatmap, ...)
# fall back to TEXT/TABLE; unlisted panel types are left unmapped.
CHART_TYPE_MAPPINGS = {
    "graph": ChartTypeClass.LINE,
    "timeseries": ChartTypeClass.LINE,
    "table": ChartTypeClass.TABLE,
    "stat": ChartTypeClass.TEXT,
    "gauge": ChartTypeClass.TEXT,
    "bargauge": ChartTypeClass.TEXT,
    "bar": ChartTypeClass.BAR,
    "pie": ChartTypeClass.PIE,
    "heatmap": ChartTypeClass.TABLE,
    "histogram": ChartTypeClass.BAR,
}
@@ -5,7 +5,9 @@ from typing import Any, Dict, Generator, List, Optional, Union
5
5
 
6
6
  import requests
7
7
  from pydantic import BaseModel, Field, ValidationError, validator
8
+ from requests.adapters import HTTPAdapter
8
9
  from typing_extensions import assert_never
10
+ from urllib3.util.retry import Retry
9
11
 
10
12
  from datahub.ingestion.api.source import SourceReport
11
13
  from datahub.ingestion.source.hex.constants import (
@@ -220,6 +222,7 @@ class HexApi:
220
222
  self.base_url = base_url
221
223
  self.report = report
222
224
  self.page_size = page_size
225
+ self.session = self._create_retry_session()
223
226
 
224
227
  def _list_projects_url(self):
225
228
  return f"{self.base_url}/projects"
@@ -227,6 +230,28 @@ class HexApi:
227
230
  def _auth_header(self):
228
231
  return {"Authorization": f"Bearer {self.token}"}
229
232
 
233
+ def _create_retry_session(self) -> requests.Session:
234
+ """Create a requests session with retry logic for rate limiting.
235
+
236
+ Hex API rate limit: 60 requests per minute
237
+ https://learn.hex.tech/docs/api/api-overview#kernel-and-rate-limits
238
+ """
239
+ session = requests.Session()
240
+
241
+ # Configure retry strategy for 429 (Too Many Requests) with exponential backoff
242
+ retry_strategy = Retry(
243
+ total=5, # Maximum number of retries
244
+ status_forcelist=[429], # Only retry on 429 status code
245
+ backoff_factor=2, # Exponential backoff: 2, 4, 8, 16, 32 seconds
246
+ raise_on_status=True, # Raise exception after max retries
247
+ )
248
+
249
+ adapter = HTTPAdapter(max_retries=retry_strategy)
250
+ session.mount("http://", adapter)
251
+ session.mount("https://", adapter)
252
+
253
+ return session
254
+
230
255
  def fetch_projects(
231
256
  self,
232
257
  include_components: bool = True,
@@ -259,7 +284,7 @@ class HexApi:
259
284
  logger.debug(f"Fetching projects page with params: {params}")
260
285
  self.report.fetch_projects_page_calls += 1
261
286
  try:
262
- response = requests.get(
287
+ response = self.session.get(
263
288
  url=self._list_projects_url(),
264
289
  headers=self._auth_header(),
265
290
  params=params,
@@ -350,6 +375,7 @@ class HexApi:
350
375
  description=hex_item.description,
351
376
  created_at=hex_item.created_at,
352
377
  last_edited_at=hex_item.last_edited_at,
378
+ last_published_at=hex_item.last_published_at,
353
379
  status=status,
354
380
  categories=categories,
355
381
  collections=collections,
@@ -364,6 +390,7 @@ class HexApi:
364
390
  description=hex_item.description,
365
391
  created_at=hex_item.created_at,
366
392
  last_edited_at=hex_item.last_edited_at,
393
+ last_published_at=hex_item.last_published_at,
367
394
  status=status,
368
395
  categories=categories,
369
396
  collections=collections,
@@ -1,3 +1,4 @@
1
+ from copy import deepcopy
1
2
  from dataclasses import dataclass
2
3
  from datetime import datetime, timedelta, timezone
3
4
  from typing import Any, Dict, Iterable, List, Optional
@@ -22,6 +23,7 @@ from datahub.ingestion.api.decorators import (
22
23
  )
23
24
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor
24
25
  from datahub.ingestion.api.workunit import MetadataWorkUnit
26
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
25
27
  from datahub.ingestion.source.hex.api import HexApi, HexApiReport
26
28
  from datahub.ingestion.source.hex.constants import (
27
29
  DATAHUB_API_PAGE_SIZE_DEFAULT,
@@ -44,7 +46,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
44
46
  StatefulIngestionConfigBase,
45
47
  StatefulIngestionSourceBase,
46
48
  )
47
- from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
48
49
  from datahub.sdk.main_client import DataHubClient
49
50
 
50
51
 
@@ -68,7 +69,7 @@ class HexSourceConfig(
68
69
  )
69
70
  include_components: bool = Field(
70
71
  default=True,
71
- desciption="Include Hex Components in the ingestion",
72
+ description="Include Hex Components in the ingestion",
72
73
  )
73
74
  page_size: int = Field(
74
75
  default=HEX_API_PAGE_SIZE_DEFAULT,
@@ -121,7 +122,11 @@ class HexSourceConfig(
121
122
 
122
123
  @root_validator(pre=True)
123
124
  def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
124
- # lineage_end_time default = now
125
+ # In-place update of the input dict would cause state contamination. This was discovered through test failures
126
+ # in test_hex.py where the same dict is reused.
127
+ # So a deepcopy is performed first.
128
+ data = deepcopy(data)
129
+
125
130
  if "lineage_end_time" not in data or data["lineage_end_time"] is None:
126
131
  data["lineage_end_time"] = datetime.now(tz=timezone.utc)
127
132
  # if string is given, parse it
@@ -166,7 +171,6 @@ class HexSourceConfig(
166
171
  class HexReport(
167
172
  StaleEntityRemovalSourceReport,
168
173
  HexApiReport,
169
- IngestionStageReport,
170
174
  HexQueryFetcherReport,
171
175
  ):
172
176
  pass
@@ -174,11 +178,18 @@ class HexReport(
174
178
 
175
179
  @platform_name("Hex")
176
180
  @config_class(HexSourceConfig)
177
- @support_status(SupportStatus.TESTING)
181
+ @support_status(SupportStatus.INCUBATING)
178
182
  @capability(SourceCapability.DESCRIPTIONS, "Supported by default")
179
183
  @capability(SourceCapability.OWNERSHIP, "Supported by default")
180
184
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
181
185
  @capability(SourceCapability.CONTAINERS, "Enabled by default")
186
+ @capability(
187
+ SourceCapability.USAGE_STATS,
188
+ "Supported by default",
189
+ subtype_modifier=[
190
+ SourceCapabilityModifier.HEX_PROJECT,
191
+ ],
192
+ )
182
193
  class HexSource(StatefulIngestionSourceBase):
183
194
  def __init__(self, config: HexSourceConfig, ctx: PipelineContext):
184
195
  super().__init__(config, ctx)
@@ -122,7 +122,7 @@ class Mapper:
122
122
  lastModified=self._change_audit_stamps(
123
123
  created_at=project.created_at, last_edited_at=project.last_edited_at
124
124
  ),
125
- externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{project.id}",
125
+ externalUrl=self._get_project_or_component_external_url(project),
126
126
  customProperties=dict(id=project.id),
127
127
  datasetEdges=self._dataset_edges(project.upstream_datasets),
128
128
  # TODO: support schema field upstream, maybe InputFields?
@@ -173,7 +173,7 @@ class Mapper:
173
173
  lastModified=self._change_audit_stamps(
174
174
  created_at=component.created_at, last_edited_at=component.last_edited_at
175
175
  ),
176
- externalUrl=f"{self._base_url}/{self._workspace_name}/hex/{component.id}",
176
+ externalUrl=self._get_project_or_component_external_url(component),
177
177
  customProperties=dict(id=component.id),
178
178
  )
179
179
 
@@ -242,6 +242,20 @@ class Mapper:
242
242
  assert isinstance(dashboard_urn, DashboardUrn)
243
243
  return dashboard_urn
244
244
 
245
+ def _get_project_or_component_external_url(
246
+ self,
247
+ project_or_component: Union[Project, Component],
248
+ ) -> Optional[str]:
249
+ if project_or_component.last_published_at is None:
250
+ return (
251
+ f"{self._base_url}/{self._workspace_name}/hex/{project_or_component.id}"
252
+ )
253
+ else:
254
+ # published Projects/Components have a different URL that everybody, not just editors, can access
255
+ return (
256
+ f"{self._base_url}/{self._workspace_name}/app/{project_or_component.id}"
257
+ )
258
+
245
259
  def _change_audit_stamps(
246
260
  self, created_at: Optional[datetime], last_edited_at: Optional[datetime]
247
261
  ) -> ChangeAuditStampsClass:
@@ -46,6 +46,7 @@ class Project:
46
46
  title: str
47
47
  description: Optional[str]
48
48
  last_edited_at: Optional[datetime] = None
49
+ last_published_at: Optional[datetime] = None
49
50
  created_at: Optional[datetime] = None
50
51
  status: Optional[Status] = None
51
52
  categories: Optional[List[Category]] = None # TODO: emit category description!
@@ -67,6 +68,7 @@ class Component:
67
68
  title: str
68
69
  description: Optional[str]
69
70
  last_edited_at: Optional[datetime] = None
71
+ last_published_at: Optional[datetime] = None
70
72
  created_at: Optional[datetime] = None
71
73
  status: Optional[Status] = None
72
74
  categories: Optional[List[Category]] = None
@@ -97,7 +97,7 @@ class HexQueryFetcher:
97
97
  if not query_urns or not entities_by_urn:
98
98
  self.report.warning(
99
99
  title="No Queries found with Hex as origin",
100
- message="No lineage because of no Queries found with Hex as origin in the given time range; you may consider extending the time range to fetch more queries.",
100
+ message="No lineage because of no Queries found with Hex as origin in the given time range. You may need to set use_queries_v2: true on your warehouse ingestion or you may consider extending the time range to fetch more queries.",
101
101
  context=str(
102
102
  dict(
103
103
  workspace_name=self.workspace_name,