acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,30 @@
1
+ import logging
2
+ from typing import Optional
3
+
4
+ from datahub.api.entities.external.external_entities import (
5
+ PlatformResourceRepository,
6
+ )
7
+ from datahub.ingestion.graph.client import DataHubGraph
8
+ from datahub.ingestion.source.aws.tag_entities import (
9
+ LakeFormationTagPlatformResource,
10
+ LakeFormationTagPlatformResourceId,
11
+ )
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class GluePlatformResourceRepository(
17
+ PlatformResourceRepository[
18
+ LakeFormationTagPlatformResourceId, LakeFormationTagPlatformResource
19
+ ]
20
+ ):
21
+ """AWS Glue-specific platform resource repository with tag-related operations."""
22
+
23
+ def __init__(
24
+ self,
25
+ graph: DataHubGraph,
26
+ platform_instance: Optional[str] = None,
27
+ catalog: Optional[str] = None,
28
+ ):
29
+ super().__init__(graph, platform_instance)
30
+ self.catalog = catalog
@@ -1,5 +1,6 @@
1
1
  import logging
2
- from typing import Iterable, Optional, Union
2
+ from dataclasses import dataclass
3
+ from typing import TYPE_CHECKING, Iterable, Optional, Union
3
4
 
4
5
  from datahub.emitter.mce_builder import make_tag_urn
5
6
  from datahub.ingestion.api.common import PipelineContext
@@ -11,9 +12,14 @@ from datahub.ingestion.source.aws.s3_util import (
11
12
  )
12
13
  from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
13
14
 
15
+ if TYPE_CHECKING:
16
+ from mypy_boto3_s3.service_resource import ObjectSummary
17
+
14
18
  logging.getLogger("py4j").setLevel(logging.ERROR)
15
19
  logger: logging.Logger = logging.getLogger(__name__)
16
20
 
21
+ LIST_OBJECTS_PAGE_SIZE = 1000
22
+
17
23
 
18
24
  def get_s3_tags(
19
25
  bucket_name: str,
@@ -74,16 +80,82 @@ def get_s3_tags(
74
80
  return new_tags
75
81
 
76
82
 
83
+ @dataclass
84
+ class DirEntry:
85
+ """
86
+ Intended to be similar to os.DirEntry, which contains a name, full path, and possibly
87
+ other attributes of a directory entry. Currently only used to represent S3 folder-like
88
+ paths.
89
+ """
90
+
91
+ name: str
92
+ path: str
93
+
94
+
77
95
  def list_folders_path(
78
- s3_uri: str, aws_config: Optional[AwsConnectionConfig]
79
- ) -> Iterable[str]:
96
+ s3_uri: str,
97
+ *,
98
+ startswith: str = "",
99
+ aws_config: Optional[AwsConnectionConfig] = None,
100
+ ) -> Iterable[DirEntry]:
101
+ """
102
+ Given an S3 URI to a folder or bucket, return all sub-folders underneath that URI,
103
+ optionally filtering by startswith. Returned entries never contain a trailing slash.
104
+ """
105
+
106
+ if not is_s3_uri(s3_uri):
107
+ raise ValueError("Not a s3 URI: " + s3_uri)
108
+ if aws_config is None:
109
+ raise ValueError("aws_config not set. Cannot browse s3")
110
+
111
+ if not s3_uri.endswith("/"):
112
+ s3_uri += "/"
113
+
114
+ bucket_name = get_bucket_name(s3_uri)
115
+ if not bucket_name:
116
+ # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
117
+ # prefix.
118
+ for folder in list_buckets(startswith, aws_config):
119
+ yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
120
+ return
121
+
122
+ prefix = get_bucket_relative_path(s3_uri) + startswith
123
+ for folder in list_folders(bucket_name, prefix, aws_config):
124
+ folder = folder.removesuffix("/").split("/")[-1]
125
+ yield DirEntry(name=folder, path=f"{s3_uri}{folder}")
126
+
127
+
128
+ def list_objects_recursive_path(
129
+ s3_uri: str,
130
+ *,
131
+ startswith: str = "",
132
+ aws_config: Optional[AwsConnectionConfig] = None,
133
+ ) -> Iterable["ObjectSummary"]:
134
+ """
135
+ Given an S3 URI to a folder or bucket, return all objects underneath that URI, optionally
136
+ filtering by startswith.
137
+ """
138
+
80
139
  if not is_s3_uri(s3_uri):
81
140
  raise ValueError("Not a s3 URI: " + s3_uri)
82
141
  if aws_config is None:
83
142
  raise ValueError("aws_config not set. Cannot browse s3")
143
+ if startswith and "/" in startswith:
144
+ raise ValueError(f"startswith contains forward slash: {repr(startswith)}")
145
+
146
+ if not s3_uri.endswith("/"):
147
+ s3_uri += "/"
148
+
84
149
  bucket_name = get_bucket_name(s3_uri)
85
- prefix = get_bucket_relative_path(s3_uri)
86
- yield from list_folders(bucket_name, prefix, aws_config)
150
+ if not bucket_name:
151
+ # No bucket name means we only have the s3[an]:// protocol, not a full bucket and
152
+ # prefix.
153
+ for bucket_name in list_buckets(startswith, aws_config):
154
+ yield from list_objects_recursive(bucket_name, "", aws_config)
155
+ return
156
+
157
+ prefix = get_bucket_relative_path(s3_uri) + startswith
158
+ yield from list_objects_recursive(bucket_name, prefix, aws_config)
87
159
 
88
160
 
89
161
  def list_folders(
@@ -99,3 +171,26 @@ def list_folders(
99
171
  if folder.endswith("/"):
100
172
  folder = folder[:-1]
101
173
  yield f"{folder}"
174
+
175
+
176
+ def list_buckets(
177
+ prefix: str, aws_config: Optional[AwsConnectionConfig]
178
+ ) -> Iterable[str]:
179
+ if aws_config is None:
180
+ raise ValueError("aws_config not set. Cannot browse s3")
181
+ s3_client = aws_config.get_s3_client()
182
+ paginator = s3_client.get_paginator("list_buckets")
183
+ for page in paginator.paginate(Prefix=prefix):
184
+ for o in page.get("Buckets", []):
185
+ yield str(o.get("Name"))
186
+
187
+
188
+ def list_objects_recursive(
189
+ bucket_name: str, prefix: str, aws_config: Optional[AwsConnectionConfig]
190
+ ) -> Iterable["ObjectSummary"]:
191
+ if aws_config is None:
192
+ raise ValueError("aws_config not set. Cannot browse s3")
193
+ s3_resource = aws_config.get_s3_resource()
194
+ bucket = s3_resource.Bucket(bucket_name)
195
+ for obj in bucket.objects.filter(Prefix=prefix).page_size(LIST_OBJECTS_PAGE_SIZE):
196
+ yield obj
@@ -0,0 +1,270 @@
1
+ import logging
2
+ from typing import TYPE_CHECKING, List, Optional
3
+
4
+ if TYPE_CHECKING:
5
+ from datahub.ingestion.source.aws.platform_resource_repository import (
6
+ GluePlatformResourceRepository,
7
+ )
8
+
9
+ from pydantic import BaseModel
10
+
11
+ from datahub.api.entities.external.external_entities import (
12
+ ExternalEntity,
13
+ ExternalEntityId,
14
+ LinkedResourceSet,
15
+ )
16
+ from datahub.api.entities.external.lake_formation_external_entites import (
17
+ LakeFormationTag,
18
+ )
19
+ from datahub.api.entities.platformresource.platform_resource import (
20
+ PlatformResource,
21
+ PlatformResourceKey,
22
+ )
23
+ from datahub.metadata.urns import TagUrn
24
+ from datahub.utilities.urns.urn import Urn
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ class LakeFormationTagSyncContext(BaseModel):
30
+ # it is intentionally empty
31
+ platform_instance: Optional[str] = None
32
+ catalog: Optional[str] = None
33
+
34
+ # Making it compatible with SyncContext interface
35
+ def get_platform_instance(self) -> Optional[str]:
36
+ return self.platform_instance
37
+
38
+
39
+ class LakeFormationTagPlatformResourceId(ExternalEntityId):
40
+ """
41
+ A LakeFormationTag is a unique identifier for a Lakeformation tag.
42
+ """
43
+
44
+ tag_key: str
45
+ tag_value: Optional[str] = None
46
+ platform_instance: Optional[str] = None
47
+ catalog: Optional[str] = None
48
+ exists_in_lake_formation: bool = False
49
+ persisted: bool = False
50
+
51
+ # this is a hack to make sure the property is a string and not private pydantic field
52
+ @staticmethod
53
+ def _RESOURCE_TYPE() -> str:
54
+ return "LakeFormationTagPlatformResource"
55
+
56
+ def to_platform_resource_key(self) -> PlatformResourceKey:
57
+ return PlatformResourceKey(
58
+ platform="glue",
59
+ resource_type=str(LakeFormationTagPlatformResourceId._RESOURCE_TYPE()),
60
+ primary_key=f"{self.catalog}.{self.tag_key}:{self.tag_value}"
61
+ if self.catalog
62
+ else f"{self.tag_key}:{self.tag_value}",
63
+ platform_instance=self.platform_instance,
64
+ )
65
+
66
+ @classmethod
67
+ def get_or_create_from_tag(
68
+ cls,
69
+ tag: LakeFormationTag,
70
+ platform_resource_repository: "GluePlatformResourceRepository",
71
+ exists_in_lake_formation: bool = False,
72
+ catalog_id: Optional[str] = None,
73
+ ) -> "LakeFormationTagPlatformResourceId":
74
+ """
75
+ Creates a LakeFormationTagPlatformResourceId from a LakeFormationTag.
76
+ """
77
+
78
+ # Use catalog_id if provided, otherwise fall back to repository catalog
79
+ effective_catalog = catalog_id or platform_resource_repository.catalog
80
+
81
+ existing_platform_resource = cls.search_by_urn(
82
+ tag.to_datahub_tag_urn().urn(),
83
+ platform_resource_repository=platform_resource_repository,
84
+ tag_sync_context=LakeFormationTagSyncContext(
85
+ platform_instance=platform_resource_repository.platform_instance,
86
+ catalog=effective_catalog,
87
+ ),
88
+ )
89
+ if existing_platform_resource:
90
+ logger.info(
91
+ f"Found existing LakeFormationTagPlatformResourceId for tag {tag.key}: {existing_platform_resource}"
92
+ )
93
+ return existing_platform_resource
94
+
95
+ return LakeFormationTagPlatformResourceId(
96
+ tag_key=str(tag.key),
97
+ tag_value=str(tag.value) if tag.value is not None else None,
98
+ platform_instance=platform_resource_repository.platform_instance,
99
+ catalog=effective_catalog,
100
+ exists_in_lake_formation=exists_in_lake_formation,
101
+ persisted=False,
102
+ )
103
+
104
+ @classmethod
105
+ def search_by_urn(
106
+ cls,
107
+ urn: str,
108
+ platform_resource_repository: "GluePlatformResourceRepository",
109
+ tag_sync_context: LakeFormationTagSyncContext,
110
+ ) -> Optional["LakeFormationTagPlatformResourceId"]:
111
+ """
112
+ Search for existing Lake Formation tag entity by URN using repository caching.
113
+
114
+ This method now delegates to the repository's search_entity_by_urn method to ensure
115
+ consistent caching behavior across all platform implementations.
116
+ """
117
+ # Use repository's cached search method instead of duplicating search logic
118
+ existing_entity_id = platform_resource_repository.search_entity_by_urn(urn)
119
+
120
+ if existing_entity_id:
121
+ # Verify platform instance and catalog match
122
+ if (
123
+ existing_entity_id.platform_instance
124
+ == tag_sync_context.platform_instance
125
+ and existing_entity_id.catalog == tag_sync_context.catalog
126
+ ):
127
+ logger.info(
128
+ f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_entity_id}"
129
+ )
130
+ # Create a new ID with the correct state instead of mutating
131
+ return LakeFormationTagPlatformResourceId(
132
+ tag_key=existing_entity_id.tag_key,
133
+ tag_value=existing_entity_id.tag_value,
134
+ platform_instance=existing_entity_id.platform_instance,
135
+ catalog=existing_entity_id.catalog,
136
+ exists_in_lake_formation=True, # This tag exists in Lake Formation
137
+ persisted=True, # And it's persisted in DataHub
138
+ )
139
+
140
+ logger.info(
141
+ f"No mapped tag found for URN {urn} with platform instance {tag_sync_context.platform_instance}. Creating a new LakeFormationTagPlatformResourceId."
142
+ )
143
+ return None
144
+
145
+ @classmethod
146
+ def from_datahub_urn(
147
+ cls,
148
+ urn: str,
149
+ platform_resource_repository: "GluePlatformResourceRepository",
150
+ tag_sync_context: LakeFormationTagSyncContext,
151
+ ) -> "LakeFormationTagPlatformResourceId":
152
+ """
153
+ Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
154
+ """
155
+ # First we check if we already have a mapped platform resource for this
156
+ # urn that is of the type UnityCatalogTagPlatformResource
157
+ # If we do, we can use it to create the UnityCatalogTagPlatformResourceId
158
+ # Else, we need to generate a new UnityCatalogTagPlatformResourceId
159
+ existing_platform_resource_id = cls.search_by_urn(
160
+ urn, platform_resource_repository, tag_sync_context
161
+ )
162
+ if existing_platform_resource_id:
163
+ logger.info(
164
+ f"Found existing LakeFormationTagPlatformResourceId for URN {urn}: {existing_platform_resource_id}"
165
+ )
166
+ return existing_platform_resource_id
167
+
168
+ # Otherwise, we need to create a new UnityCatalogTagPlatformResourceId
169
+ new_tag_id = cls.generate_tag_id(tag_sync_context, urn)
170
+ if new_tag_id:
171
+ # we then check if this tag has already been ingested as a platform
172
+ # resource in the platform resource repository
173
+ resource_key = platform_resource_repository.get(
174
+ new_tag_id.to_platform_resource_key()
175
+ )
176
+ if resource_key:
177
+ logger.info(
178
+ f"Tag {new_tag_id} already exists in platform resource repository with {resource_key}"
179
+ )
180
+ # Create a new ID with the correct state instead of mutating
181
+ return LakeFormationTagPlatformResourceId(
182
+ tag_key=new_tag_id.tag_key,
183
+ tag_value=new_tag_id.tag_value,
184
+ platform_instance=new_tag_id.platform_instance,
185
+ catalog=new_tag_id.catalog,
186
+ exists_in_lake_formation=True, # This tag exists in Lake Formation
187
+ persisted=new_tag_id.persisted,
188
+ )
189
+ return new_tag_id
190
+ raise ValueError(f"Unable to create LakeFormationTagId from DataHub URN: {urn}")
191
+
192
+ @classmethod
193
+ def generate_tag_id(
194
+ cls, tag_sync_context: LakeFormationTagSyncContext, urn: str
195
+ ) -> "LakeFormationTagPlatformResourceId":
196
+ parsed_urn = Urn.from_string(urn)
197
+ entity_type = parsed_urn.entity_type
198
+ if entity_type == "tag":
199
+ new_tag_id = LakeFormationTagPlatformResourceId.from_datahub_tag(
200
+ TagUrn.from_string(urn), tag_sync_context
201
+ )
202
+ else:
203
+ raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
204
+ return new_tag_id
205
+
206
+ @classmethod
207
+ def from_datahub_tag(
208
+ cls, tag_urn: TagUrn, tag_sync_context: LakeFormationTagSyncContext
209
+ ) -> "LakeFormationTagPlatformResourceId":
210
+ tag = LakeFormationTag.from_urn(tag_urn)
211
+
212
+ return LakeFormationTagPlatformResourceId(
213
+ tag_key=str(tag.key),
214
+ tag_value=str(tag.value),
215
+ platform_instance=tag_sync_context.platform_instance,
216
+ catalog=tag_sync_context.catalog,
217
+ exists_in_lake_formation=False,
218
+ )
219
+
220
+
221
+ class LakeFormationTagPlatformResource(ExternalEntity):
222
+ datahub_urns: LinkedResourceSet
223
+ managed_by_datahub: bool
224
+ id: LakeFormationTagPlatformResourceId
225
+ allowed_values: Optional[List[str]] = None
226
+
227
+ def get_id(self) -> ExternalEntityId:
228
+ return self.id
229
+
230
+ def is_managed_by_datahub(self) -> bool:
231
+ return self.managed_by_datahub
232
+
233
+ def datahub_linked_resources(self) -> LinkedResourceSet:
234
+ return self.datahub_urns
235
+
236
+ def as_platform_resource(self) -> PlatformResource:
237
+ return PlatformResource.create(
238
+ key=self.id.to_platform_resource_key(),
239
+ secondary_keys=[u for u in self.datahub_urns.urns],
240
+ value=self,
241
+ )
242
+
243
+ @classmethod
244
+ def create_default(
245
+ cls,
246
+ entity_id: ExternalEntityId,
247
+ managed_by_datahub: bool,
248
+ ) -> "LakeFormationTagPlatformResource":
249
+ """Create a default Lake Formation tag entity when none found in DataHub."""
250
+ # Type narrowing: we know this will be a LakeFormationTagPlatformResourceId
251
+ assert isinstance(entity_id, LakeFormationTagPlatformResourceId), (
252
+ f"Expected LakeFormationTagPlatformResourceId, got {type(entity_id)}"
253
+ )
254
+
255
+ # Create a new entity ID with correct default state instead of mutating
256
+ default_entity_id = LakeFormationTagPlatformResourceId(
257
+ tag_key=entity_id.tag_key,
258
+ tag_value=entity_id.tag_value,
259
+ platform_instance=entity_id.platform_instance,
260
+ catalog=entity_id.catalog,
261
+ exists_in_lake_formation=False, # New entities don't exist in Lake Formation yet
262
+ persisted=False, # New entities are not persisted yet
263
+ )
264
+
265
+ return cls(
266
+ id=default_entity_id,
267
+ datahub_urns=LinkedResourceSet(urns=[]),
268
+ managed_by_datahub=managed_by_datahub,
269
+ allowed_values=None,
270
+ )
@@ -61,13 +61,13 @@ class AzureConnectionConfig(ConfigModel):
61
61
  def get_blob_service_client(self):
62
62
  return BlobServiceClient(
63
63
  account_url=f"https://{self.account_name}.blob.core.windows.net",
64
- credential=f"{self.get_credentials()}",
64
+ credential=self.get_credentials(),
65
65
  )
66
66
 
67
67
  def get_data_lake_service_client(self) -> DataLakeServiceClient:
68
68
  return DataLakeServiceClient(
69
69
  account_url=f"https://{self.account_name}.dfs.core.windows.net",
70
- credential=f"{self.get_credentials()}",
70
+ credential=self.get_credentials(),
71
71
  )
72
72
 
73
73
  def get_credentials(
@@ -81,7 +81,7 @@ class AzureConnectionConfig(ConfigModel):
81
81
  )
82
82
  return self.sas_token if self.sas_token is not None else self.account_key
83
83
 
84
- @root_validator()
84
+ @root_validator(skip_on_failure=True)
85
85
  def _check_credential_values(cls, values: Dict) -> Dict:
86
86
  if (
87
87
  values.get("account_key")
@@ -4,6 +4,7 @@ import logging
4
4
  import os
5
5
  from typing import Iterable, List, Optional
6
6
 
7
+ from datahub.configuration.common import AllowDenyPattern
7
8
  from datahub.ingestion.api.common import PipelineContext
8
9
  from datahub.ingestion.api.decorators import (
9
10
  SupportStatus,
@@ -44,9 +45,11 @@ from datahub.ingestion.source.bigquery_v2.queries_extractor import (
44
45
  BigQueryQueriesExtractorConfig,
45
46
  )
46
47
  from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor
48
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
47
49
  from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
48
50
  from datahub.ingestion.source.state.redundant_run_skip_handler import (
49
51
  RedundantLineageRunSkipHandler,
52
+ RedundantQueriesRunSkipHandler,
50
53
  RedundantUsageRunSkipHandler,
51
54
  )
52
55
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -77,7 +80,14 @@ def cleanup(config: BigQueryV2Config) -> None:
77
80
  supported=False,
78
81
  )
79
82
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
80
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
83
+ @capability(
84
+ SourceCapability.CONTAINERS,
85
+ "Enabled by default",
86
+ subtype_modifier=[
87
+ SourceCapabilityModifier.BIGQUERY_PROJECT,
88
+ SourceCapabilityModifier.BIGQUERY_DATASET,
89
+ ],
90
+ )
81
91
  @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
82
92
  @capability(
83
93
  SourceCapability.DATA_PROFILING,
@@ -99,6 +109,7 @@ def cleanup(config: BigQueryV2Config) -> None:
99
109
  SourceCapability.PARTITION_SUPPORT,
100
110
  "Enabled by default, partition keys and clustering keys are supported.",
101
111
  )
112
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
102
113
  class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
103
114
  def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
104
115
  super().__init__(config, ctx)
@@ -135,7 +146,10 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
135
146
  redundant_lineage_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = (
136
147
  None
137
148
  )
138
- if self.config.enable_stateful_lineage_ingestion:
149
+ if (
150
+ self.config.enable_stateful_lineage_ingestion
151
+ and not self.config.use_queries_v2
152
+ ):
139
153
  redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
140
154
  source=self,
141
155
  config=self.config,
@@ -241,7 +255,23 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
241
255
  ).workunit_processor,
242
256
  ]
243
257
 
258
+ def _warn_deprecated_configs(self):
259
+ if (
260
+ self.config.match_fully_qualified_names is not None
261
+ and not self.config.match_fully_qualified_names
262
+ and self.config.schema_pattern is not None
263
+ and self.config.schema_pattern != AllowDenyPattern.allow_all()
264
+ ):
265
+ self.report.report_warning(
266
+ message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
267
+ "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
268
+ "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
269
+ context="Config option deprecation warning",
270
+ title="Config option deprecation warning",
271
+ )
272
+
244
273
  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
274
+ self._warn_deprecated_configs()
245
275
  projects = get_projects(
246
276
  self.bq_schema_extractor.schema_api,
247
277
  self.report,
@@ -270,28 +300,41 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
270
300
  ):
271
301
  return
272
302
 
273
- with self.report.new_stage(
274
- f"*: {QUERIES_EXTRACTION}"
275
- ), BigQueryQueriesExtractor(
276
- connection=self.config.get_bigquery_client(),
277
- schema_api=self.bq_schema_extractor.schema_api,
278
- config=BigQueryQueriesExtractorConfig(
279
- window=self.config,
280
- user_email_pattern=self.config.usage.user_email_pattern,
281
- include_lineage=self.config.include_table_lineage,
282
- include_usage_statistics=self.config.include_usage_statistics,
283
- include_operations=self.config.usage.include_operational_stats,
284
- include_queries=self.config.include_queries,
285
- include_query_usage_statistics=self.config.include_query_usage_statistics,
286
- top_n_queries=self.config.usage.top_n_queries,
287
- region_qualifiers=self.config.region_qualifiers,
288
- ),
289
- structured_report=self.report,
290
- filters=self.filters,
291
- identifiers=self.identifiers,
292
- schema_resolver=self.sql_parser_schema_resolver,
293
- discovered_tables=self.bq_schema_extractor.table_refs,
294
- ) as queries_extractor:
303
+ redundant_queries_run_skip_handler: Optional[
304
+ RedundantQueriesRunSkipHandler
305
+ ] = None
306
+ if self.config.enable_stateful_time_window:
307
+ redundant_queries_run_skip_handler = RedundantQueriesRunSkipHandler(
308
+ source=self,
309
+ config=self.config,
310
+ pipeline_name=self.ctx.pipeline_name,
311
+ run_id=self.ctx.run_id,
312
+ )
313
+
314
+ with (
315
+ self.report.new_stage(f"*: {QUERIES_EXTRACTION}"),
316
+ BigQueryQueriesExtractor(
317
+ connection=self.config.get_bigquery_client(),
318
+ schema_api=self.bq_schema_extractor.schema_api,
319
+ config=BigQueryQueriesExtractorConfig(
320
+ window=self.config,
321
+ user_email_pattern=self.config.usage.user_email_pattern,
322
+ include_lineage=self.config.include_table_lineage,
323
+ include_usage_statistics=self.config.include_usage_statistics,
324
+ include_operations=self.config.usage.include_operational_stats,
325
+ include_queries=self.config.include_queries,
326
+ include_query_usage_statistics=self.config.include_query_usage_statistics,
327
+ top_n_queries=self.config.usage.top_n_queries,
328
+ region_qualifiers=self.config.region_qualifiers,
329
+ ),
330
+ structured_report=self.report,
331
+ filters=self.filters,
332
+ identifiers=self.identifiers,
333
+ redundant_run_skip_handler=redundant_queries_run_skip_handler,
334
+ schema_resolver=self.sql_parser_schema_resolver,
335
+ discovered_tables=self.bq_schema_extractor.table_refs,
336
+ ) as queries_extractor,
337
+ ):
295
338
  self.report.queries_extractor = queries_extractor.report
296
339
  yield from queries_extractor.get_workunits_internal()
297
340
  else: