acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,15 @@
1
+ import json
1
2
  import logging
2
- import textwrap
3
3
  from dataclasses import dataclass
4
- from typing import Iterable, List, Optional, Tuple
4
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
5
5
 
6
- from pydantic import Field, SecretStr
6
+ from pydantic import BaseModel, Field, SecretStr
7
7
  from slack_sdk import WebClient
8
8
  from tenacity import retry, wait_exponential
9
9
  from tenacity.before_sleep import before_sleep_log
10
10
 
11
11
  import datahub.emitter.mce_builder as builder
12
+ from datahub.emitter.mce_builder import datahub_guid, make_dataplatform_instance_urn
12
13
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
13
14
  from datahub.ingestion.api.common import PipelineContext
14
15
  from datahub.ingestion.api.decorators import (
@@ -22,6 +23,7 @@ from datahub.ingestion.api.source import (
22
23
  SourceReport,
23
24
  )
24
25
  from datahub.ingestion.api.workunit import MetadataWorkUnit
26
+ from datahub.ingestion.source.common.subtypes import DatasetSubTypes
25
27
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
26
28
  StaleEntityRemovalHandler,
27
29
  StaleEntityRemovalSourceReport,
@@ -32,16 +34,153 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
32
34
  )
33
35
  from datahub.metadata.schema_classes import (
34
36
  CorpUserEditableInfoClass,
37
+ CorpUserSettingsClass,
38
+ DataPlatformInstanceClass,
39
+ DataPlatformInstancePropertiesClass,
35
40
  DatasetPropertiesClass,
36
41
  DeprecationClass,
42
+ NotificationSettingsClass,
43
+ PlatformResourceInfoClass,
44
+ SerializedValueClass,
45
+ SerializedValueContentTypeClass,
46
+ SerializedValueSchemaTypeClass,
47
+ SlackNotificationSettingsClass,
48
+ SlackUserInfoClass as SlackUserInfo,
49
+ StatusClass,
37
50
  SubTypesClass,
51
+ _Aspect,
38
52
  )
39
53
  from datahub.utilities.ratelimiter import RateLimiter
54
+ from datahub.utilities.str_enum import StrEnum
40
55
  from datahub.utilities.urns.urn import Urn
41
56
 
42
57
  logger: logging.Logger = logging.getLogger(__name__)
43
58
 
44
59
 
60
+ # TODO: Relocate this function to a utility module
61
+ def is_picture_default_or_missing(picture_link: Optional[str]) -> bool:
62
+ if not picture_link:
63
+ return True
64
+ return picture_link.endswith("default_avatar.png")
65
+
66
+
67
+ def is_slack_image(picture_link: Optional[str]) -> bool:
68
+ """
69
+ Guesses if the picture link is a slack image.
70
+ """
71
+ if not picture_link:
72
+ return False
73
+ return "slack-edge.com" in picture_link
74
+
75
+
76
+ class ResourceType(StrEnum):
77
+ USER_INFO = "user-info"
78
+ CHANNEL_INFO = "channel-info"
79
+
80
+
81
+ class SlackInstance(BaseModel):
82
+ id: str
83
+ name: Optional[str] = None
84
+ description: Optional[str] = None
85
+ external_url: Optional[str] = None
86
+ custom_properties: Optional[Dict[str, str]] = None
87
+
88
+ def to_platform_instance_urn(self) -> str:
89
+ return make_dataplatform_instance_urn(
90
+ platform=DATA_PLATFORM_SLACK_URN, instance=self.id
91
+ )
92
+
93
+ def with_slack_team_info(self, team_info: dict) -> "SlackInstance":
94
+ """
95
+ team_info looks like this
96
+ {'id': 'T22BUCL1LKW', 'name': 'DataHub', 'url': 'https://datahubspace.slack.com/', 'domain': 'datahub', 'email_domain': '', 'icon': {'image_default': False, 'image_34': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_34.png', 'image_44': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_44.png', 'image_68': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_68.png', 'image_88': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_88.png', 'image_102': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_102.png', 'image_230': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_230.png', 'image_132': 'https://avatars.slack-edge.com/2021-07-05/2228585180071_63e6f300a919abc488bb_132.png'}, 'avatar_base_url': 'https://ca.slack-edge.com/', 'is_verified': False, 'external_org_migrations': {'date_updated': 1722672564, 'current': []}, 'discoverable': 'closed', 'enterprise_id': 'E06TPM5T1G9', 'enterprise_name': 'DataHub', 'enterprise_domain': 'datahubspace', 'lob_sales_home_enabled': False}
97
+ """
98
+ self.name = team_info.get("name")
99
+ self.description = team_info.get("name")
100
+ self.external_url = team_info.get("url")
101
+ self.custom_properties = {
102
+ k: v
103
+ for k, v in {
104
+ "domain": team_info.get("domain"),
105
+ "enterprise_id": team_info.get("enterprise_id"),
106
+ "enterprise_name": team_info.get("enterprise_name"),
107
+ "enterprise_domain": team_info.get("enterprise_domain"),
108
+ "icon": team_info.get("icon", {}).get("image_102"),
109
+ }.items()
110
+ if v is not None
111
+ }
112
+ return self
113
+
114
+ def to_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
115
+ return [
116
+ MetadataChangeProposalWrapper(
117
+ entityUrn=self.to_platform_instance_urn(),
118
+ aspect=DataPlatformInstancePropertiesClass(
119
+ name=self.name or self.id,
120
+ description=self.description,
121
+ externalUrl=self.external_url or None,
122
+ customProperties=self.custom_properties or {},
123
+ ),
124
+ )
125
+ ]
126
+
127
+
128
+ def to_serialized_value(value: _Aspect) -> SerializedValueClass:
129
+ # HACK: we remove the .pegasus2avro from the schema type since we want to refer to
130
+ # the original pdl type
131
+ schema_type = value.RECORD_SCHEMA.fullname.replace(".pegasus2avro", "")
132
+ serialized_value = SerializedValueClass(
133
+ blob=json.dumps(value.to_obj()).encode("utf-8"),
134
+ contentType=SerializedValueContentTypeClass.JSON,
135
+ schemaType=SerializedValueSchemaTypeClass.PEGASUS,
136
+ schemaRef=schema_type,
137
+ )
138
+ return serialized_value
139
+
140
+
141
+ class SlackUserDetails:
142
+ def __init__(self, slack_user_info: SlackUserInfo):
143
+ self.slack_user_info = slack_user_info
144
+
145
+ def to_guid(self) -> str:
146
+ """
147
+ A slack user is uniquely identified by the combination of their id and teamId.
148
+ """
149
+ return datahub_guid(
150
+ {"id": self.slack_user_info.id, "dpi": self.slack_user_info.teamId}
151
+ )
152
+
153
+ def get_resource_urn(self) -> str:
154
+ return f"urn:li:platformResource:{self.to_guid()}"
155
+
156
+ def to_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
157
+ resource_urn = self.get_resource_urn()
158
+
159
+ dpi = DataPlatformInstanceClass(
160
+ platform=DATA_PLATFORM_SLACK_URN,
161
+ instance=self.slack_user_info.slackInstance,
162
+ )
163
+
164
+ secondary_keys = []
165
+ if self.slack_user_info.email:
166
+ secondary_keys.append(self.slack_user_info.email)
167
+
168
+ resource_info = PlatformResourceInfoClass(
169
+ resourceType=ResourceType.USER_INFO.value,
170
+ value=to_serialized_value(self.slack_user_info),
171
+ primaryKey=self.slack_user_info.id,
172
+ secondaryKeys=secondary_keys,
173
+ )
174
+
175
+ status = StatusClass(
176
+ removed=self.slack_user_info.isDeleted,
177
+ )
178
+
179
+ yield from MetadataChangeProposalWrapper.construct_many(
180
+ resource_urn, aspects=[dpi, resource_info, status]
181
+ )
182
+
183
+
45
184
  @dataclass
46
185
  class CorpUser:
47
186
  urn: Optional[str] = None
@@ -52,42 +191,43 @@ class CorpUser:
52
191
  phone: Optional[str] = None
53
192
  real_name: Optional[str] = None
54
193
  slack_display_name: Optional[str] = None
194
+ team_id: Optional[str] = None
195
+ team_domain: Optional[str] = None
196
+ is_team_enterprise: Optional[bool] = None
55
197
 
56
198
 
57
199
  class SlackSourceConfig(
58
200
  StatefulIngestionConfigBase,
59
201
  ):
60
202
  bot_token: SecretStr = Field(
61
- description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email` and `users.profile:read` scopes.",
203
+ description="Bot token for the Slack workspace. Needs `users:read`, `users:read.email`, `users.profile:read`, and `team:read` scopes.",
62
204
  )
63
205
  enrich_user_metadata: bool = Field(
64
- type=bool,
65
- default=True,
66
- description="Whether to enrich user metadata.",
206
+ True,
207
+ description="When enabled, will enrich provisioned DataHub users' metadata with information from Slack.",
208
+ )
209
+ ingest_users: bool = Field(
210
+ True,
211
+ description="Whether to ingest users. When set to true, will ingest all users in the Slack workspace (as platform resources) to simplify user enrichment after they are provisioned on DataHub.",
67
212
  )
68
213
  api_requests_per_min: int = Field(
69
- type=int,
70
- default=10,
214
+ 10,
71
215
  description="Number of API requests per minute. Low-level config. Do not tweak unless you are facing any issues.",
72
216
  )
73
217
  ingest_public_channels: bool = Field(
74
- type=bool,
75
- default=False,
218
+ False,
76
219
  description="Whether to ingest public channels. If set to true needs `channels:read` scope.",
77
220
  )
78
221
  channels_iteration_limit: int = Field(
79
- type=int,
80
- default=200,
222
+ 200,
81
223
  description="Limit the number of channels to be ingested in a iteration. Low-level config. Do not tweak unless you are facing any issues.",
82
224
  )
83
225
  channel_min_members: int = Field(
84
- type=int,
85
- default=2,
226
+ 2,
86
227
  description="Ingest channels with at least this many members.",
87
228
  )
88
229
  should_ingest_archived_channels: bool = Field(
89
- type=bool,
90
- default=False,
230
+ False,
91
231
  description="Whether to ingest archived channels.",
92
232
  )
93
233
 
@@ -96,14 +236,16 @@ class SlackSourceConfig(
96
236
  class SlackSourceReport(StaleEntityRemovalSourceReport):
97
237
  channels_reported: int = 0
98
238
  archived_channels_reported: int = 0
239
+ users_reported: int = 0
99
240
 
100
241
 
101
242
  PLATFORM_NAME = "slack"
243
+ DATA_PLATFORM_SLACK_URN: str = builder.make_data_platform_urn(PLATFORM_NAME)
102
244
 
103
245
 
104
246
  @platform_name("Slack")
105
247
  @config_class(SlackSourceConfig)
106
- @support_status(SupportStatus.TESTING)
248
+ @support_status(SupportStatus.CERTIFIED)
107
249
  class SlackSource(StatefulIngestionSourceBase):
108
250
  def __init__(self, ctx: PipelineContext, config: SlackSourceConfig):
109
251
  super().__init__(config, ctx)
@@ -124,6 +266,38 @@ class SlackSource(StatefulIngestionSourceBase):
124
266
  def get_slack_client(self) -> WebClient:
125
267
  return WebClient(token=self.config.bot_token.get_secret_value())
126
268
 
269
+ @staticmethod
270
+ def populate_slack_member_from_response(
271
+ user: Dict[str, Any], slack_instance: SlackInstance
272
+ ) -> SlackUserDetails:
273
+ profile = user.get("profile", {})
274
+
275
+ user_info = SlackUserInfo(
276
+ slackInstance=slack_instance.to_platform_instance_urn(),
277
+ id=user["id"],
278
+ name=user["name"],
279
+ realName=user.get("real_name", ""),
280
+ displayName=profile.get("display_name", ""),
281
+ email=profile.get("email"),
282
+ teamId=user["team_id"],
283
+ isDeleted=user.get("deleted", False),
284
+ isAdmin=user.get("is_admin", False),
285
+ isOwner=user.get("is_owner", False),
286
+ isPrimaryOwner=user.get("is_primary_owner", False),
287
+ isBot=user.get("is_bot", False),
288
+ timezone=user.get("tz"),
289
+ timezoneOffset=user.get("tz_offset"),
290
+ title=profile.get("title"),
291
+ phone=profile.get("phone"),
292
+ profilePictureUrl=profile.get(
293
+ "image_192"
294
+ ), # Using 192px image as an example
295
+ statusText=profile.get("status_text"),
296
+ statusEmoji=profile.get("status_emoji"),
297
+ lastUpdatedSeconds=user.get("updated"),
298
+ )
299
+ return SlackUserDetails(slack_user_info=user_info)
300
+
127
301
  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
128
302
  return [
129
303
  *super().get_workunit_processors(),
@@ -143,46 +317,108 @@ class SlackSource(StatefulIngestionSourceBase):
143
317
  logger.info(auth_resp.data)
144
318
  if self.config.ingest_public_channels:
145
319
  yield from self.get_public_channels()
146
- if self.config.enrich_user_metadata:
320
+ if self.config.enrich_user_metadata or self.config.ingest_users:
147
321
  yield from self.get_user_info()
148
322
 
323
+ def _get_datahub_user_info(
324
+ self,
325
+ ) -> Dict[str, Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
326
+ # get_user_to_be_updated ensures that the email field is not None
327
+ users = {
328
+ user_obj.email: (user_obj, editable_properties)
329
+ for user_obj, editable_properties in self.get_user_to_be_updated()
330
+ if user_obj.email
331
+ }
332
+ return users
333
+
149
334
  def get_user_info(self) -> Iterable[MetadataWorkUnit]:
150
- assert self.ctx.graph is not None
151
- for user_obj in self.get_user_to_be_updated():
152
- self.populate_slack_id_from_email(user_obj)
153
- if user_obj.slack_id is None:
154
- continue
155
- self.populate_user_profile(user_obj)
156
- if user_obj.urn is None:
157
- continue
158
- logger.info(f"User: {user_obj}")
159
- corpuser_editable_info = (
160
- self.ctx.graph.get_aspect(
161
- entity_urn=user_obj.urn, aspect_type=CorpUserEditableInfoClass
162
- )
163
- or CorpUserEditableInfoClass()
164
- )
165
- corpuser_editable_info.email = user_obj.email
166
- corpuser_editable_info.slack = user_obj.slack_id
167
- corpuser_editable_info.title = user_obj.title
168
- if user_obj.image_url:
169
- corpuser_editable_info.pictureLink = user_obj.image_url
170
- if user_obj.phone:
171
- corpuser_editable_info.phone = user_obj.phone
172
- if (
173
- not corpuser_editable_info.displayName
174
- or corpuser_editable_info.displayName == corpuser_editable_info.email
175
- ):
176
- # let's fill out a real name
177
- corpuser_editable_info.displayName = user_obj.real_name
178
- yield MetadataWorkUnit(
179
- id=f"{user_obj.urn}",
180
- mcp=MetadataChangeProposalWrapper(
181
- entityUrn=user_obj.urn,
182
- aspect=corpuser_editable_info,
183
- ),
335
+ # Get team information to populate for all users
336
+ slack_instance: Optional[SlackInstance] = None
337
+ with self.rate_limiter:
338
+ team_response = self.get_slack_client().team_info()
339
+ if team_response and "team" in team_response:
340
+ team_info = team_response["team"]
341
+ slack_instance = SlackInstance(id=team_info.get("id"))
342
+ slack_instance = slack_instance.with_slack_team_info(team_info)
343
+
344
+ if slack_instance:
345
+ for mcp in slack_instance.to_mcps():
346
+ yield mcp.as_workunit()
347
+ else:
348
+ logger.error("Failed to fetch team information")
349
+ self.report.report_failure(
350
+ "team_info", "Failed to fetch team information for users"
184
351
  )
185
352
 
353
+ assert slack_instance
354
+
355
+ # Fetch all DataHub users that need to be updated
356
+ if self.config.enrich_user_metadata:
357
+ datahub_users = self._get_datahub_user_info()
358
+ else:
359
+ datahub_users = {}
360
+ cursor = None
361
+ while True:
362
+ with self.rate_limiter:
363
+ response = self.get_slack_client().users_list(cursor=cursor)
364
+ assert isinstance(response.data, dict)
365
+ if not response.data["ok"]:
366
+ self.report.report_failure("users", "Failed to fetch users")
367
+ return
368
+
369
+ assert self.ctx.graph is not None
370
+ for user in response.data["members"]:
371
+ # Query all slack users and ingest them into the generic
372
+ # slackMember aspect
373
+ slack_user_details: SlackUserDetails = (
374
+ self.populate_slack_member_from_response(user, slack_instance)
375
+ )
376
+ if self.config.ingest_users:
377
+ for mcp in slack_user_details.to_mcps():
378
+ yield mcp.as_workunit()
379
+
380
+ platform_resource_urn = slack_user_details.get_resource_urn()
381
+ # If user is in DataHub, compute and emit CorpUserEditableInfo
382
+ # aspect. This code will be removed once we have server side
383
+ # processing of raw slackMember aspects. This code path can also
384
+ # be turned off by setting enrich_user_metadata to False.
385
+ user_obj_props_tuple = datahub_users.get(user["profile"].get("email"))
386
+ if user_obj_props_tuple is None:
387
+ # User is not in DataHub or enrichment is disabled
388
+ continue
389
+ user_obj, editable_properties = user_obj_props_tuple
390
+ slack_user_profile = user.get("profile", {})
391
+ user_obj.slack_id = user.get("id")
392
+ user_obj.title = slack_user_profile.get("title")
393
+ user_obj.image_url = slack_user_profile.get("image_192")
394
+ user_obj.phone = slack_user_profile.get("phone")
395
+ user_obj.real_name = slack_user_profile.get("real_name")
396
+ user_obj.slack_display_name = slack_user_profile.get("display_name")
397
+ corpuser_editable_info = editable_properties or (
398
+ CorpUserEditableInfoClass()
399
+ )
400
+ emittable_corpuser_editable_info = self.populate_corpuser_editable_info(
401
+ corpuser_editable_info,
402
+ user_obj,
403
+ platform_resource_urn=platform_resource_urn,
404
+ slack_instance=slack_instance,
405
+ )
406
+ if emittable_corpuser_editable_info:
407
+ yield MetadataChangeProposalWrapper(
408
+ entityUrn=user_obj.urn, aspect=emittable_corpuser_editable_info
409
+ ).as_workunit()
410
+ # if we update corpusereditable info, we also update
411
+ # slackuserinfo. This will be removed once we have server
412
+ # side processing of raw slackMember aspects.
413
+ yield MetadataChangeProposalWrapper(
414
+ entityUrn=user_obj.urn,
415
+ aspect=slack_user_details.slack_user_info,
416
+ ).as_workunit()
417
+ yield from self.emit_corp_user_slack_settings(user_obj)
418
+ cursor = str(response.data["response_metadata"]["next_cursor"])
419
+ if not cursor:
420
+ break
421
+
186
422
  def _get_channel_info(
187
423
  self, cursor: Optional[str]
188
424
  ) -> Tuple[List[MetadataWorkUnit], Optional[str]]:
@@ -251,7 +487,7 @@ class SlackSource(StatefulIngestionSourceBase):
251
487
  mcp=MetadataChangeProposalWrapper(
252
488
  entityUrn=urn_channel,
253
489
  aspect=SubTypesClass(
254
- typeNames=["Slack Channel"],
490
+ typeNames=[DatasetSubTypes.SLACK_CHANNEL],
255
491
  ),
256
492
  ),
257
493
  )
@@ -259,6 +495,58 @@ class SlackSource(StatefulIngestionSourceBase):
259
495
  cursor = str(response.data["response_metadata"]["next_cursor"])
260
496
  return result_channels, cursor
261
497
 
498
+ def populate_corpuser_editable_info(
499
+ self,
500
+ corpuser_editable_info: CorpUserEditableInfoClass,
501
+ user_obj: CorpUser,
502
+ platform_resource_urn: str,
503
+ slack_instance: SlackInstance,
504
+ ) -> Optional[CorpUserEditableInfoClass]:
505
+ """
506
+ Populate CorpUserEditableInfo aspect with user information from Slack.
507
+ If changes are not required, None is returned.
508
+ If changes are required, the updated aspect is returned.
509
+ """
510
+ mutation_required = False
511
+ if not corpuser_editable_info.email and user_obj.email:
512
+ mutation_required = True
513
+ corpuser_editable_info.email = user_obj.email
514
+ if not corpuser_editable_info.slack and user_obj.slack_id:
515
+ mutation_required = True
516
+ corpuser_editable_info.slack = user_obj.slack_id
517
+ if not corpuser_editable_info.title and user_obj.title:
518
+ mutation_required = True
519
+ corpuser_editable_info.title = user_obj.title
520
+ if user_obj.image_url and (
521
+ is_picture_default_or_missing(corpuser_editable_info.pictureLink)
522
+ or (
523
+ is_slack_image(corpuser_editable_info.pictureLink)
524
+ and user_obj.image_url != corpuser_editable_info.pictureLink
525
+ )
526
+ ):
527
+ mutation_required = True
528
+ corpuser_editable_info.pictureLink = user_obj.image_url
529
+ if user_obj.phone and not corpuser_editable_info.phone:
530
+ mutation_required = True
531
+ corpuser_editable_info.phone = user_obj.phone
532
+ if (
533
+ not corpuser_editable_info.displayName
534
+ or corpuser_editable_info.displayName == corpuser_editable_info.email
535
+ ) and user_obj.real_name:
536
+ mutation_required = True
537
+ corpuser_editable_info.displayName = user_obj.real_name
538
+ if mutation_required:
539
+ # update informationSources
540
+ corpuser_editable_info.informationSources = (
541
+ []
542
+ if not corpuser_editable_info.informationSources
543
+ else corpuser_editable_info.informationSources
544
+ )
545
+ if platform_resource_urn not in corpuser_editable_info.informationSources:
546
+ corpuser_editable_info.informationSources.append(platform_resource_urn)
547
+ return corpuser_editable_info
548
+ return None
549
+
262
550
  def get_public_channels(self) -> Iterable[MetadataWorkUnit]:
263
551
  cursor = None
264
552
  while True:
@@ -270,103 +558,78 @@ class SlackSource(StatefulIngestionSourceBase):
270
558
  if not cursor:
271
559
  break
272
560
 
273
- def populate_user_profile(self, user_obj: CorpUser) -> None:
274
- if not user_obj.slack_id:
561
+ def emit_slack_member_aspect(
562
+ self, user: SlackUserInfo
563
+ ) -> Iterable[MetadataWorkUnit]:
564
+ slack_user = SlackUserDetails(slack_user_info=user)
565
+ for mcp in slack_user.to_mcps():
566
+ yield mcp.as_workunit()
567
+
568
+ def emit_corp_user_slack_settings(
569
+ self, user_obj: CorpUser
570
+ ) -> Iterable[MetadataWorkUnit]:
571
+ assert self.ctx.graph is not None
572
+
573
+ if not user_obj.urn:
275
574
  return
276
- try:
277
- # https://api.slack.com/methods/users.profile.get
278
- with self.rate_limiter:
279
- if self._use_users_info:
280
- user_profile_res = self.get_slack_client().users_info(
281
- user=user_obj.slack_id
282
- )
283
- user_profile_res = user_profile_res.get("user", {})
284
- else:
285
- user_profile_res = self.get_slack_client().users_profile_get(
286
- user=user_obj.slack_id
287
- )
288
- logger.debug(f"User profile: {user_profile_res}")
289
- user_profile = user_profile_res.get("profile", {})
290
- user_obj.title = user_profile.get("title")
291
- user_obj.image_url = user_profile.get("image_192")
292
- user_obj.phone = user_profile.get("phone")
293
- user_obj.real_name = user_profile.get("real_name")
294
- user_obj.slack_display_name = user_profile.get("display_name")
295
-
296
- except Exception as e:
297
- if "missing_scope" in str(e):
298
- if self._use_users_info:
299
- raise e
300
- self._use_users_info = True
301
- self.populate_user_profile(user_obj)
575
+
576
+ corp_user_settings = self.ctx.graph.get_aspect(
577
+ user_obj.urn, CorpUserSettingsClass
578
+ )
579
+ if not corp_user_settings:
302
580
  return
303
581
 
304
- def populate_slack_id_from_email(self, user_obj: CorpUser) -> None:
305
- if user_obj.email is None:
582
+ notification_settings = corp_user_settings.notificationSettings
583
+
584
+ if not notification_settings:
585
+ corp_user_settings.notificationSettings = NotificationSettingsClass(
586
+ sinkTypes=[],
587
+ slackSettings=SlackNotificationSettingsClass(
588
+ userHandle=user_obj.slack_id
589
+ ),
590
+ )
591
+ elif (
592
+ not notification_settings.slackSettings
593
+ or not notification_settings.slackSettings.userHandle
594
+ ):
595
+ notification_settings.slackSettings = SlackNotificationSettingsClass(
596
+ userHandle=user_obj.slack_id
597
+ )
598
+ else:
306
599
  return
307
- try:
308
- # https://api.slack.com/methods/users.lookupByEmail
309
- with self.rate_limiter:
310
- user_info_res = self.get_slack_client().users_lookupByEmail(
311
- email=user_obj.email
312
- )
313
- user_info = user_info_res.get("user", {})
314
- user_obj.slack_id = user_info.get("id")
315
- except Exception as e:
316
- if "users_not_found" in str(e):
317
- return
318
- raise e
600
+
601
+ yield MetadataWorkUnit(
602
+ id=f"{user_obj.urn}",
603
+ mcp=MetadataChangeProposalWrapper(
604
+ entityUrn=user_obj.urn,
605
+ aspect=corp_user_settings,
606
+ ),
607
+ )
319
608
 
320
609
  @retry(
321
610
  wait=wait_exponential(multiplier=2, min=4, max=60),
322
611
  before_sleep=before_sleep_log(logger, logging.ERROR, True),
323
612
  )
324
- def get_user_to_be_updated(self) -> Iterable[CorpUser]:
325
- graphql_query = textwrap.dedent(
326
- """
327
- query listUsers($input: ListUsersInput!) {
328
- listUsers(input: $input) {
329
- total
330
- users {
331
- urn
332
- editableProperties {
333
- email
334
- slack
335
- }
336
- }
337
- }
338
- }
339
- """
340
- )
341
- start = 0
342
- count = 10
343
- total = count
344
-
613
+ def get_user_to_be_updated(
614
+ self,
615
+ ) -> Iterable[Tuple[CorpUser, Optional[CorpUserEditableInfoClass]]]:
345
616
  assert self.ctx.graph is not None
346
-
347
- while start < total:
348
- variables = {"input": {"start": start, "count": count}}
349
- response = self.ctx.graph.execute_graphql(
350
- query=graphql_query, variables=variables
617
+ for urn in self.ctx.graph.get_urns_by_filter(
618
+ entity_types=["corpuser"], query="*"
619
+ ):
620
+ user_obj = CorpUser()
621
+ user_obj.urn = urn
622
+ editable_properties = self.ctx.graph.get_aspect(
623
+ urn, CorpUserEditableInfoClass
351
624
  )
352
- list_users = response.get("listUsers", {})
353
- total = list_users.get("total", 0)
354
- users = list_users.get("users", [])
355
- for user in users:
356
- user_obj = CorpUser()
357
- editable_properties = user.get("editableProperties", {})
358
- user_obj.urn = user.get("urn")
359
- if user_obj.urn is None:
360
- continue
361
- if editable_properties is not None:
362
- user_obj.email = editable_properties.get("email")
363
- if user_obj.email is None:
364
- urn_id = Urn.from_string(user_obj.urn).get_entity_id_as_string()
365
- if "@" in urn_id:
366
- user_obj.email = urn_id
367
- if user_obj.email is not None:
368
- yield user_obj
369
- start += count
625
+ if editable_properties and editable_properties.email:
626
+ user_obj.email = editable_properties.email
627
+ else:
628
+ urn_id = Urn.from_string(user_obj.urn).get_entity_id_as_string()
629
+ if "@" in urn_id:
630
+ user_obj.email = urn_id
631
+ if user_obj.email is not None:
632
+ yield (user_obj, editable_properties)
370
633
 
371
634
  def get_report(self) -> SourceReport:
372
635
  return self.report
File without changes