acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -3,6 +3,7 @@ import logging
3
3
  import re
4
4
  import time
5
5
  from collections import OrderedDict, defaultdict
6
+ from copy import deepcopy
6
7
  from dataclasses import dataclass, field as dataclass_field
7
8
  from datetime import datetime, timedelta, timezone
8
9
  from functools import lru_cache
@@ -12,6 +13,7 @@ from typing import (
12
13
  Dict,
13
14
  Iterable,
14
15
  List,
16
+ Literal,
15
17
  Optional,
16
18
  Set,
17
19
  Tuple,
@@ -79,6 +81,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
79
81
  from datahub.ingestion.source.common.subtypes import (
80
82
  BIContainerSubTypes,
81
83
  DatasetSubTypes,
84
+ SourceCapabilityModifier,
82
85
  )
83
86
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
84
87
  StaleEntityRemovalHandler,
@@ -117,7 +120,6 @@ from datahub.ingestion.source.tableau.tableau_common import (
117
120
  )
118
121
  from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
119
122
  from datahub.ingestion.source.tableau.tableau_validation import check_user_role
120
- from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
121
123
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
122
124
  AuditStamp,
123
125
  ChangeAuditStamps,
@@ -147,7 +149,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
147
149
  )
148
150
  from datahub.metadata.schema_classes import (
149
151
  BrowsePathsClass,
150
- ChangeTypeClass,
151
152
  ChartInfoClass,
152
153
  ChartUsageStatisticsClass,
153
154
  DashboardInfoClass,
@@ -473,6 +474,13 @@ class TableauPageSizeConfig(ConfigModel):
473
474
  return self.database_table_page_size or self.page_size
474
475
 
475
476
 
477
+ _IngestHiddenAssetsOptionsType = Literal["worksheet", "dashboard"]
478
+ _IngestHiddenAssetsOptions: List[_IngestHiddenAssetsOptionsType] = [
479
+ "worksheet",
480
+ "dashboard",
481
+ ]
482
+
483
+
476
484
  class TableauConfig(
477
485
  DatasetLineageProviderConfigBase,
478
486
  StatefulIngestionConfigBase,
@@ -523,10 +531,22 @@ class TableauConfig(
523
531
  default=False,
524
532
  description="Ingest Owner from source. This will override Owner info entered from UI",
525
533
  )
534
+ use_email_as_username: bool = Field(
535
+ default=False,
536
+ description="Use email address instead of username for entity owners. Requires ingest_owner to be True.",
537
+ )
526
538
  ingest_tables_external: bool = Field(
527
539
  default=False,
528
540
  description="Ingest details for tables external to (not embedded in) tableau as entities.",
529
541
  )
542
+ emit_all_published_datasources: bool = Field(
543
+ default=False,
544
+ description="Ingest all published data sources. When False (default), only ingest published data sources that belong to an ingested workbook.",
545
+ )
546
+ emit_all_embedded_datasources: bool = Field(
547
+ default=False,
548
+ description="Ingest all embedded data sources. When False (default), only ingest embedded data sources that belong to an ingested workbook.",
549
+ )
530
550
 
531
551
  env: str = Field(
532
552
  default=builder.DEFAULT_ENV,
@@ -573,13 +593,13 @@ class TableauConfig(
573
593
  )
574
594
 
575
595
  extract_lineage_from_unsupported_custom_sql_queries: bool = Field(
576
- default=False,
577
- description="[Experimental] Whether to extract lineage from unsupported custom sql queries using SQL parsing",
596
+ default=True,
597
+ description="[Experimental] Extract lineage from Custom SQL queries using DataHub's SQL parser in cases where the Tableau Catalog API fails to return lineage for the query.",
578
598
  )
579
599
 
580
600
  force_extraction_of_lineage_from_custom_sql_queries: bool = Field(
581
601
  default=False,
582
- description="[Experimental] Force extraction of lineage from custom sql queries using SQL parsing, ignoring Tableau metadata",
602
+ description="[Experimental] Force extraction of lineage from Custom SQL queries using DataHub's SQL parser, even when the Tableau Catalog API returns lineage already.",
583
603
  )
584
604
 
585
605
  sql_parsing_disable_schema_awareness: bool = Field(
@@ -612,10 +632,14 @@ class TableauConfig(
612
632
  description="Configuration settings for ingesting Tableau groups and their capabilities as custom properties.",
613
633
  )
614
634
 
615
- ingest_hidden_assets: bool = Field(
616
- True,
617
- description="When enabled, hidden views and dashboards are ingested into Datahub. "
618
- "If a dashboard or view is hidden in Tableau the luid is blank. Default of this config field is True.",
635
+ ingest_hidden_assets: Union[List[_IngestHiddenAssetsOptionsType], bool] = Field(
636
+ _IngestHiddenAssetsOptions,
637
+ description=(
638
+ "When enabled, hidden worksheets and dashboards are ingested into Datahub."
639
+ " If a dashboard or worksheet is hidden in Tableau the luid is blank."
640
+ " A list of asset types can also be specified, to only ingest those hidden assets."
641
+ " Current options supported are 'worksheet' and 'dashboard'."
642
+ ),
619
643
  )
620
644
 
621
645
  tags_for_hidden_assets: List[str] = Field(
@@ -631,6 +655,11 @@ class TableauConfig(
631
655
  # pre = True because we want to take some decision before pydantic initialize the configuration to default values
632
656
  @root_validator(pre=True)
633
657
  def projects_backward_compatibility(cls, values: Dict) -> Dict:
658
+ # In-place update of the input dict would cause state contamination. This was discovered through test failures
659
+ # in test_hex.py where the same dict is reused.
660
+ # So a copy is performed first.
661
+ values = deepcopy(values)
662
+
634
663
  projects = values.get("projects")
635
664
  project_pattern = values.get("project_pattern")
636
665
  project_path_pattern = values.get("project_path_pattern")
@@ -642,6 +671,7 @@ class TableauConfig(
642
671
  values["project_pattern"] = AllowDenyPattern(
643
672
  allow=[f"^{prj}$" for prj in projects]
644
673
  )
674
+ values.pop("projects")
645
675
  elif (project_pattern or project_path_pattern) and projects:
646
676
  raise ValueError(
647
677
  "projects is deprecated. Please use project_path_pattern only."
@@ -653,7 +683,7 @@ class TableauConfig(
653
683
 
654
684
  return values
655
685
 
656
- @root_validator()
686
+ @root_validator(skip_on_failure=True)
657
687
  def validate_config_values(cls, values: Dict) -> Dict:
658
688
  tags_for_hidden_assets = values.get("tags_for_hidden_assets")
659
689
  ingest_tags = values.get("ingest_tags")
@@ -665,6 +695,14 @@ class TableauConfig(
665
695
  raise ValueError(
666
696
  "tags_for_hidden_assets is only allowed with ingest_tags enabled. Be aware that this will overwrite tags entered from the UI."
667
697
  )
698
+
699
+ use_email_as_username = values.get("use_email_as_username")
700
+ ingest_owner = values.get("ingest_owner")
701
+ if use_email_as_username and not ingest_owner:
702
+ raise ValueError(
703
+ "use_email_as_username requires ingest_owner to be enabled."
704
+ )
705
+
668
706
  return values
669
707
 
670
708
 
@@ -756,7 +794,6 @@ class SiteIdContentUrl:
756
794
  @dataclass
757
795
  class TableauSourceReport(
758
796
  StaleEntityRemovalSourceReport,
759
- IngestionStageReport,
760
797
  ):
761
798
  get_all_datasources_query_failed: bool = False
762
799
  num_get_datasource_query_failures: int = 0
@@ -826,6 +863,9 @@ class TableauSourceReport(
826
863
  default_factory=(lambda: defaultdict(int))
827
864
  )
828
865
 
866
+ # Owner extraction statistics
867
+ num_email_fallback_to_username: int = 0
868
+
829
869
 
830
870
  def report_user_role(report: TableauSourceReport, server: Server) -> None:
831
871
  title: str = "Insufficient Permissions"
@@ -856,16 +896,29 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
856
896
  @platform_name("Tableau")
857
897
  @config_class(TableauConfig)
858
898
  @support_status(SupportStatus.CERTIFIED)
899
+ @capability(
900
+ SourceCapability.CONTAINERS,
901
+ "Enabled by default",
902
+ subtype_modifier=[
903
+ SourceCapabilityModifier.TABLEAU_PROJECT,
904
+ SourceCapabilityModifier.TABLEAU_SITE,
905
+ SourceCapabilityModifier.TABLEAU_WORKBOOK,
906
+ ],
907
+ )
859
908
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
860
909
  @capability(SourceCapability.DOMAINS, "Requires transformer", supported=False)
861
910
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
862
911
  @capability(
863
912
  SourceCapability.USAGE_STATS,
864
913
  "Dashboard/Chart view counts, enabled using extract_usage_stats config",
914
+ subtype_modifier=[
915
+ SourceCapabilityModifier.DASHBOARD,
916
+ SourceCapabilityModifier.CHART,
917
+ ],
865
918
  )
866
919
  @capability(
867
920
  SourceCapability.DELETION_DETECTION,
868
- "Enabled by default when stateful ingestion is turned on.",
921
+ "Enabled by default via stateful ingestion.",
869
922
  )
870
923
  @capability(SourceCapability.OWNERSHIP, "Requires recipe configuration")
871
924
  @capability(SourceCapability.TAGS, "Requires recipe configuration")
@@ -874,6 +927,7 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
874
927
  SourceCapability.LINEAGE_FINE,
875
928
  "Enabled by default, configure using `extract_column_level_lineage`",
876
929
  )
930
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
877
931
  class TableauSource(StatefulIngestionSourceBase, TestableSource):
878
932
  platform = "tableau"
879
933
 
@@ -1157,7 +1211,7 @@ class TableauSiteSource:
1157
1211
  self.report.warning(
1158
1212
  title="Incomplete project hierarchy",
1159
1213
  message="Project details missing. Child projects will be ingested without reference to their parent project. We generally need Site Administrator Explorer permissions to extract the complete project hierarchy.",
1160
- context=f"Missing {project.parent_id}, referenced by {project.id} {project.project_name}",
1214
+ context=f"Missing {project.parent_id}, referenced by {project.id} {project.name}",
1161
1215
  )
1162
1216
  project.parent_id = None
1163
1217
 
@@ -1348,6 +1402,26 @@ class TableauSiteSource:
1348
1402
  # More info here: https://help.tableau.com/current/api/metadata_api/en-us/reference/view.doc.html
1349
1403
  return not dashboard_or_view.get(c.LUID)
1350
1404
 
1405
+ def _should_ingest_worksheet(self, worksheet: Dict) -> bool:
1406
+ return (
1407
+ self.config.ingest_hidden_assets is True
1408
+ or (
1409
+ isinstance(self.config.ingest_hidden_assets, list)
1410
+ and "worksheet" in self.config.ingest_hidden_assets
1411
+ )
1412
+ or not self._is_hidden_view(worksheet)
1413
+ )
1414
+
1415
+ def _should_ingest_dashboard(self, dashboard: Dict) -> bool:
1416
+ return (
1417
+ self.config.ingest_hidden_assets is True
1418
+ or (
1419
+ isinstance(self.config.ingest_hidden_assets, list)
1420
+ and "dashboard" in self.config.ingest_hidden_assets
1421
+ )
1422
+ or not self._is_hidden_view(dashboard)
1423
+ )
1424
+
1351
1425
  def get_connection_object_page(
1352
1426
  self,
1353
1427
  query: str,
@@ -1369,7 +1443,9 @@ class TableauSiteSource:
1369
1443
  `fetch_size:` The number of records to retrieve from Tableau
1370
1444
  Server in a single API call, starting from the current cursor position on Tableau Server.
1371
1445
  """
1372
- retries_remaining = retries_remaining or self.config.max_retries
1446
+ retries_remaining = (
1447
+ self.config.max_retries if retries_remaining is None else retries_remaining
1448
+ )
1373
1449
 
1374
1450
  logger.debug(
1375
1451
  f"Query {connection_type} to get {fetch_size} objects with cursor {current_cursor}"
@@ -1512,12 +1588,15 @@ class TableauSiteSource:
1512
1588
  }}""",
1513
1589
  )
1514
1590
  else:
1515
- # As of Tableau Server 2024.2, the metadata API sporadically returns a 30-second
1516
- # timeout error.
1517
- # It doesn't reliably happen, so retrying a couple of times makes sense.
1518
1591
  if all(
1592
+ # As of Tableau Server 2024.2, the metadata API sporadically returns a 30-second
1593
+ # timeout error.
1594
+ # It doesn't reliably happen, so retrying a couple of times makes sense.
1519
1595
  error.get("message")
1520
1596
  == "Execution canceled because timeout of 30000 millis was reached"
1597
+ # The Metadata API sometimes returns an 'unexpected error' message when querying
1598
+ # embeddedDatasourcesConnection. Try retrying a couple of times.
1599
+ or error.get("message") == "Unexpected error occurred"
1521
1600
  for error in errors
1522
1601
  ):
1523
1602
  # If it was only a timeout error, we can retry.
@@ -1529,8 +1608,8 @@ class TableauSiteSource:
1529
1608
  (self.config.max_retries - retries_remaining + 1) ** 2, 60
1530
1609
  )
1531
1610
  logger.info(
1532
- f"Query {connection_type} received a 30 second timeout error - will retry in {backoff_time} seconds. "
1533
- f"Retries remaining: {retries_remaining}"
1611
+ f"Query {connection_type} received a retryable error with {retries_remaining} retries remaining, "
1612
+ f"will retry in {backoff_time} seconds: {errors}"
1534
1613
  )
1535
1614
  time.sleep(backoff_time)
1536
1615
  return self.get_connection_object_page(
@@ -1540,7 +1619,7 @@ class TableauSiteSource:
1540
1619
  fetch_size=fetch_size,
1541
1620
  current_cursor=current_cursor,
1542
1621
  retry_on_auth_error=True,
1543
- retries_remaining=retries_remaining,
1622
+ retries_remaining=retries_remaining - 1,
1544
1623
  )
1545
1624
  raise RuntimeError(f"Query {connection_type} error: {errors}")
1546
1625
 
@@ -1623,7 +1702,7 @@ class TableauSiteSource:
1623
1702
  # if multiple project has name C. Ideal solution is to use projectLuidWithin to avoid duplicate project,
1624
1703
  # however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later.
1625
1704
  project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
1626
- if project_luid not in self.tableau_project_registry.keys():
1705
+ if project_luid not in self.tableau_project_registry:
1627
1706
  wrk_name: Optional[str] = workbook.get(c.NAME)
1628
1707
  wrk_id: Optional[str] = workbook.get(c.ID)
1629
1708
  prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)
@@ -2147,32 +2226,32 @@ class TableauSiteSource:
2147
2226
  else []
2148
2227
  )
2149
2228
 
2150
- # The Tableau SQL parser much worse than our sqlglot based parser,
2151
- # so relying on metadata parsed by Tableau from SQL queries can be
2152
- # less accurate. This option allows us to ignore Tableau's parser and
2153
- # only use our own.
2154
- if self.config.force_extraction_of_lineage_from_custom_sql_queries:
2155
- logger.debug("Extracting TLL & CLL from custom sql (forced)")
2229
+ tableau_table_list = csql.get(c.TABLES, [])
2230
+ if self.config.force_extraction_of_lineage_from_custom_sql_queries or (
2231
+ not tableau_table_list
2232
+ and self.config.extract_lineage_from_unsupported_custom_sql_queries
2233
+ ):
2234
+ if not tableau_table_list:
2235
+ # custom sql tables may contain unsupported sql, causing incomplete lineage
2236
+ # we extract the lineage from the raw queries
2237
+ logger.debug(
2238
+ "Parsing TLL & CLL from custom sql (tableau metadata incomplete)"
2239
+ )
2240
+ else:
2241
+ # The Tableau SQL parser is much worse than our sqlglot based parser,
2242
+ # so relying on metadata parsed by Tableau from SQL queries can be
2243
+ # less accurate. This option allows us to ignore Tableau's parser and
2244
+ # only use our own.
2245
+ logger.debug("Parsing TLL & CLL from custom sql (forced)")
2246
+
2156
2247
  yield from self._create_lineage_from_unsupported_csql(
2157
2248
  csql_urn, csql, columns
2158
2249
  )
2159
2250
  else:
2160
- tables = csql.get(c.TABLES, [])
2161
-
2162
- if tables:
2163
- # lineage from custom sql -> datasets/tables #
2164
- yield from self._create_lineage_to_upstream_tables(
2165
- csql_urn, tables, datasource
2166
- )
2167
- elif (
2168
- self.config.extract_lineage_from_unsupported_custom_sql_queries
2169
- ):
2170
- logger.debug("Extracting TLL & CLL from custom sql")
2171
- # custom sql tables may contain unsupported sql, causing incomplete lineage
2172
- # we extract the lineage from the raw queries
2173
- yield from self._create_lineage_from_unsupported_csql(
2174
- csql_urn, csql, columns
2175
- )
2251
+ # lineage from custom sql -> datasets/tables #
2252
+ yield from self._create_lineage_to_upstream_tables(
2253
+ csql_urn, tableau_table_list, datasource
2254
+ )
2176
2255
 
2177
2256
  # Schema Metadata
2178
2257
  schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
@@ -2210,7 +2289,6 @@ class TableauSiteSource:
2210
2289
  yield self.get_metadata_change_event(dataset_snapshot)
2211
2290
  yield self.get_metadata_change_proposal(
2212
2291
  dataset_snapshot.urn,
2213
- aspect_name=c.SUB_TYPES,
2214
2292
  aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]),
2215
2293
  )
2216
2294
 
@@ -2253,7 +2331,7 @@ class TableauSiteSource:
2253
2331
  # It is possible due to https://github.com/tableau/server-client-python/issues/1210
2254
2332
  if (
2255
2333
  ds.get(c.LUID)
2256
- and ds[c.LUID] not in self.datasource_project_map.keys()
2334
+ and ds[c.LUID] not in self.datasource_project_map
2257
2335
  and self.report.get_all_datasources_query_failed
2258
2336
  ):
2259
2337
  logger.debug(
@@ -2265,7 +2343,7 @@ class TableauSiteSource:
2265
2343
 
2266
2344
  if (
2267
2345
  ds.get(c.LUID)
2268
- and ds[c.LUID] in self.datasource_project_map.keys()
2346
+ and ds[c.LUID] in self.datasource_project_map
2269
2347
  and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry
2270
2348
  ):
2271
2349
  return self.datasource_project_map[ds[c.LUID]]
@@ -2375,7 +2453,6 @@ class TableauSiteSource:
2375
2453
  upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
2376
2454
  yield self.get_metadata_change_proposal(
2377
2455
  csql_urn,
2378
- aspect_name=c.UPSTREAM_LINEAGE,
2379
2456
  aspect=upstream_lineage,
2380
2457
  )
2381
2458
  self.report.num_tables_with_upstream_lineage += 1
@@ -2561,7 +2638,6 @@ class TableauSiteSource:
2561
2638
  )
2562
2639
  yield self.get_metadata_change_proposal(
2563
2640
  csql_urn,
2564
- aspect_name=c.UPSTREAM_LINEAGE,
2565
2641
  aspect=upstream_lineage,
2566
2642
  )
2567
2643
  self.report.num_tables_with_upstream_lineage += 1
@@ -2607,14 +2683,10 @@ class TableauSiteSource:
2607
2683
  def get_metadata_change_proposal(
2608
2684
  self,
2609
2685
  urn: str,
2610
- aspect_name: str,
2611
2686
  aspect: Union["UpstreamLineage", "SubTypesClass"],
2612
2687
  ) -> MetadataWorkUnit:
2613
2688
  return MetadataChangeProposalWrapper(
2614
- entityType=c.DATASET,
2615
- changeType=ChangeTypeClass.UPSERT,
2616
2689
  entityUrn=urn,
2617
- aspectName=aspect_name,
2618
2690
  aspect=aspect,
2619
2691
  ).as_workunit()
2620
2692
 
@@ -2671,13 +2743,12 @@ class TableauSiteSource:
2671
2743
  dataset_snapshot.aspects.append(browse_paths)
2672
2744
 
2673
2745
  # Ownership
2674
- owner = (
2675
- self._get_ownership(datasource_info[c.OWNER][c.USERNAME])
2676
- if datasource_info
2677
- and datasource_info.get(c.OWNER)
2678
- and datasource_info[c.OWNER].get(c.USERNAME)
2746
+ owner_identifier = (
2747
+ self._get_owner_identifier(datasource_info[c.OWNER])
2748
+ if datasource_info and datasource_info.get(c.OWNER)
2679
2749
  else None
2680
2750
  )
2751
+ owner = self._get_ownership(owner_identifier) if owner_identifier else None
2681
2752
  if owner is not None:
2682
2753
  dataset_snapshot.aspects.append(owner)
2683
2754
 
@@ -2722,7 +2793,6 @@ class TableauSiteSource:
2722
2793
  )
2723
2794
  yield self.get_metadata_change_proposal(
2724
2795
  datasource_urn,
2725
- aspect_name=c.UPSTREAM_LINEAGE,
2726
2796
  aspect=upstream_lineage,
2727
2797
  )
2728
2798
  self.report.num_tables_with_upstream_lineage += 1
@@ -2741,7 +2811,6 @@ class TableauSiteSource:
2741
2811
  yield self.get_metadata_change_event(dataset_snapshot)
2742
2812
  yield self.get_metadata_change_proposal(
2743
2813
  dataset_snapshot.urn,
2744
- aspect_name=c.SUB_TYPES,
2745
2814
  aspect=SubTypesClass(
2746
2815
  typeNames=(
2747
2816
  ["Embedded Data Source"]
@@ -2827,7 +2896,11 @@ class TableauSiteSource:
2827
2896
  return datasource
2828
2897
 
2829
2898
  def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
2830
- datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}
2899
+ datasource_filter = (
2900
+ {}
2901
+ if self.config.emit_all_published_datasources
2902
+ else {c.ID_WITH_IN: self.datasource_ids_being_used}
2903
+ )
2831
2904
 
2832
2905
  for datasource in self.get_connection_objects(
2833
2906
  query=published_datasource_graphql_query,
@@ -3059,7 +3132,7 @@ class TableauSiteSource:
3059
3132
  query_filter=sheets_filter,
3060
3133
  page_size=self.config.effective_sheet_page_size,
3061
3134
  ):
3062
- if self.config.ingest_hidden_assets or not self._is_hidden_view(sheet):
3135
+ if self._should_ingest_worksheet(sheet):
3063
3136
  yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
3064
3137
  else:
3065
3138
  self.report.num_hidden_assets_skipped += 1
@@ -3080,7 +3153,7 @@ class TableauSiteSource:
3080
3153
 
3081
3154
  creator: Optional[str] = None
3082
3155
  if workbook is not None and workbook.get(c.OWNER) is not None:
3083
- creator = workbook[c.OWNER].get(c.USERNAME)
3156
+ creator = self._get_owner_identifier(workbook[c.OWNER])
3084
3157
  created_at = sheet.get(c.CREATED_AT, datetime.now())
3085
3158
  updated_at = sheet.get(c.UPDATED_AT, datetime.now())
3086
3159
  last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3229,7 +3302,7 @@ class TableauSiteSource:
3229
3302
 
3230
3303
  def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
3231
3304
  workbook_container_key = self.gen_workbook_key(workbook[c.ID])
3232
- creator = workbook.get(c.OWNER, {}).get(c.USERNAME)
3305
+ creator = self._get_owner_identifier(workbook.get(c.OWNER, {}))
3233
3306
 
3234
3307
  owner_urn = (
3235
3308
  builder.make_user_urn(creator)
@@ -3252,7 +3325,7 @@ class TableauSiteSource:
3252
3325
 
3253
3326
  parent_key = None
3254
3327
  project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
3255
- if project_luid and project_luid in self.tableau_project_registry.keys():
3328
+ if project_luid and project_luid in self.tableau_project_registry:
3256
3329
  parent_key = self.gen_project_key(project_luid)
3257
3330
  else:
3258
3331
  workbook_id: Optional[str] = workbook.get(c.ID)
@@ -3380,7 +3453,7 @@ class TableauSiteSource:
3380
3453
  query_filter=dashboards_filter,
3381
3454
  page_size=self.config.effective_dashboard_page_size,
3382
3455
  ):
3383
- if self.config.ingest_hidden_assets or not self._is_hidden_view(dashboard):
3456
+ if self._should_ingest_dashboard(dashboard):
3384
3457
  yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
3385
3458
  else:
3386
3459
  self.report.num_hidden_assets_skipped += 1
@@ -3411,7 +3484,7 @@ class TableauSiteSource:
3411
3484
 
3412
3485
  creator: Optional[str] = None
3413
3486
  if workbook is not None and workbook.get(c.OWNER) is not None:
3414
- creator = workbook[c.OWNER].get(c.USERNAME)
3487
+ creator = self._get_owner_identifier(workbook[c.OWNER])
3415
3488
  created_at = dashboard.get(c.CREATED_AT, datetime.now())
3416
3489
  updated_at = dashboard.get(c.UPDATED_AT, datetime.now())
3417
3490
  last_modified = self.get_last_modified(creator, created_at, updated_at)
@@ -3520,7 +3593,11 @@ class TableauSiteSource:
3520
3593
  return browse_paths
3521
3594
 
3522
3595
  def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]:
3523
- datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
3596
+ datasource_filter = (
3597
+ {}
3598
+ if self.config.emit_all_embedded_datasources
3599
+ else {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
3600
+ )
3524
3601
 
3525
3602
  for datasource in self.get_connection_objects(
3526
3603
  query=embedded_datasource_graphql_query,
@@ -3554,6 +3631,20 @@ class TableauSiteSource:
3554
3631
  )
3555
3632
  return last_modified
3556
3633
 
3634
+ def _get_owner_identifier(self, owner_dict: dict) -> Optional[str]:
3635
+ """Extract owner identifier (email or username) based on configuration."""
3636
+ if not owner_dict:
3637
+ return None
3638
+
3639
+ if self.config.use_email_as_username:
3640
+ email = owner_dict.get(c.EMAIL)
3641
+ if email:
3642
+ return email
3643
+ # Fall back to username if email is not available
3644
+ self.report.num_email_fallback_to_username += 1
3645
+
3646
+ return owner_dict.get(c.USERNAME)
3647
+
3557
3648
  @lru_cache(maxsize=None)
3558
3649
  def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
3559
3650
  if self.config.ingest_owner and user:
@@ -3632,7 +3723,7 @@ class TableauSiteSource:
3632
3723
  container_key=project_key,
3633
3724
  name=project_.name,
3634
3725
  description=project_.description,
3635
- sub_types=[c.PROJECT],
3726
+ sub_types=[BIContainerSubTypes.TABLEAU_PROJECT],
3636
3727
  parent_container_key=parent_project_key,
3637
3728
  )
3638
3729
 
@@ -3650,7 +3741,7 @@ class TableauSiteSource:
3650
3741
  yield from gen_containers(
3651
3742
  container_key=self.gen_site_key(self.site_id),
3652
3743
  name=self.site.name or "Default",
3653
- sub_types=[c.SITE],
3744
+ sub_types=[BIContainerSubTypes.TABLEAU_SITE],
3654
3745
  )
3655
3746
 
3656
3747
  def _fetch_groups(self):
@@ -3777,3 +3868,15 @@ class TableauSiteSource:
3777
3868
  self.report.emit_upstream_tables_timer[self.site_content_url] = (
3778
3869
  timer.elapsed_seconds(digits=2)
3779
3870
  )
3871
+
3872
+ # Log owner extraction statistics if there were fallbacks
3873
+ if (
3874
+ self.config.use_email_as_username
3875
+ and self.config.ingest_owner
3876
+ and self.report.num_email_fallback_to_username > 0
3877
+ ):
3878
+ logger.info(
3879
+ f"Owner extraction summary for site '{self.site_content_url}': "
3880
+ f"{self.report.num_email_fallback_to_username} entities fell back from email to username "
3881
+ f"(email was not available)"
3882
+ )
@@ -65,6 +65,7 @@ workbook_graphql_query = """
65
65
  projectName
66
66
  owner {
67
67
  username
68
+ email
68
69
  }
69
70
  description
70
71
  uri
@@ -107,6 +108,7 @@ sheet_graphql_query = """
107
108
  luid
108
109
  owner {
109
110
  username
111
+ email
110
112
  }
111
113
  }
112
114
  datasourceFields {
@@ -185,6 +187,7 @@ dashboard_graphql_query = """
185
187
  luid
186
188
  owner {
187
189
  username
190
+ email
188
191
  }
189
192
  }
190
193
  }
@@ -268,6 +271,7 @@ embedded_datasource_graphql_query = """
268
271
  luid
269
272
  owner {
270
273
  username
274
+ email
271
275
  }
272
276
  }
273
277
  }
@@ -424,6 +428,7 @@ published_datasource_graphql_query = """
424
428
  }
425
429
  owner {
426
430
  username
431
+ email
427
432
  }
428
433
  description
429
434
  uri
@@ -579,10 +584,12 @@ def get_platform(connection_type: str) -> str:
579
584
  platform = "oracle"
580
585
  elif connection_type in ("tbio", "teradata"):
581
586
  platform = "teradata"
582
- elif connection_type in ("sqlserver"):
587
+ elif connection_type in ("sqlserver",):
583
588
  platform = "mssql"
584
- elif connection_type in ("athena"):
589
+ elif connection_type in ("athena",):
585
590
  platform = "athena"
591
+ elif connection_type in ("googlebigquery",):
592
+ platform = "bigquery"
586
593
  elif connection_type.endswith("_jdbc"):
587
594
  # e.g. convert trino_jdbc -> trino
588
595
  platform = connection_type[: -len("_jdbc")]
@@ -774,7 +781,7 @@ def get_overridden_info(
774
781
  if (
775
782
  lineage_overrides is not None
776
783
  and lineage_overrides.platform_override_map is not None
777
- and original_platform in lineage_overrides.platform_override_map.keys()
784
+ and original_platform in lineage_overrides.platform_override_map
778
785
  ):
779
786
  platform = lineage_overrides.platform_override_map[original_platform]
780
787
 
@@ -782,7 +789,7 @@ def get_overridden_info(
782
789
  lineage_overrides is not None
783
790
  and lineage_overrides.database_override_map is not None
784
791
  and upstream_db is not None
785
- and upstream_db in lineage_overrides.database_override_map.keys()
792
+ and upstream_db in lineage_overrides.database_override_map
786
793
  ):
787
794
  upstream_db = lineage_overrides.database_override_map[upstream_db]
788
795
 
@@ -50,7 +50,6 @@ TABLES = "tables"
50
50
  DESCRIPTION = "description"
51
51
  SQL = "SQL"
52
52
  QUERY = "query"
53
- SUB_TYPES = "subTypes"
54
53
  VIEW = "view"
55
54
  CUSTOM_SQL = "Custom SQL"
56
55
  REMOTE_TYPE = "remoteType"
@@ -58,9 +57,9 @@ UNKNOWN = "UNKNOWN"
58
57
  PUBLISHED_DATA_SOURCE = "PublishedDatasource"
59
58
  LUID = "luid"
60
59
  EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
61
- UPSTREAM_LINEAGE = "upstreamLineage"
62
60
  OWNER = "owner"
63
61
  USERNAME = "username"
62
+ EMAIL = "email"
64
63
  HAS_EXTRACTS = "hasExtracts"
65
64
  EXTRACT_LAST_REFRESH_TIME = "extractLastRefreshTime"
66
65
  EXTRACT_LAST_INCREMENTAL_UPDATE_TIME = "extractLastIncrementalUpdateTime"
@@ -78,8 +77,6 @@ CHART = "chart"
78
77
  DASHBOARD = "dashboard"
79
78
  DASHBOARDS_CONNECTION = "dashboardsConnection"
80
79
  EMBEDDED_DATA_SOURCES_CONNECTION = "embeddedDatasourcesConnection"
81
- PROJECT = "Project"
82
- SITE = "Site"
83
80
  IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
84
81
  SITE_PERMISSION = "sitePermission"
85
82
  ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer"