acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This is a potentially problematic release.


This version of acryl-datahub might be problematic. See the release details for more information.

Files changed (503):
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,13 @@
1
1
  import logging
2
- import os
3
2
  import re
3
+ from copy import deepcopy
4
4
  from datetime import timedelta
5
- from typing import Any, Dict, List, Optional, Union
5
+ from typing import Dict, List, Optional, Union
6
6
 
7
- from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
8
- from google.cloud.logging_v2.client import Client as GCPLoggingClient
9
7
  from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator
10
8
 
11
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
9
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
10
+ from datahub.configuration.env_vars import get_bigquery_schema_parallelism
12
11
  from datahub.configuration.source_common import (
13
12
  EnvConfigMixin,
14
13
  LowerCaseDatasetUrnConfigMixin,
@@ -18,21 +17,22 @@ from datahub.configuration.validate_field_removal import pydantic_removed_field
18
17
  from datahub.ingestion.glossary.classification_mixin import (
19
18
  ClassificationSourceConfigMixin,
20
19
  )
21
- from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
20
+ from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
21
+ BigQueryConnectionConfig,
22
+ )
22
23
  from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
23
24
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
24
25
  from datahub.ingestion.source.state.stateful_ingestion_base import (
25
26
  StatefulLineageConfigMixin,
26
27
  StatefulProfilingConfigMixin,
28
+ StatefulTimeWindowConfigMixin,
27
29
  StatefulUsageConfigMixin,
28
30
  )
29
31
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
30
32
 
31
33
  logger = logging.getLogger(__name__)
32
34
 
33
- DEFAULT_BQ_SCHEMA_PARALLELISM = int(
34
- os.getenv("DATAHUB_BIGQUERY_SCHEMA_PARALLELISM", 20)
35
- )
35
+ DEFAULT_BQ_SCHEMA_PARALLELISM = get_bigquery_schema_parallelism()
36
36
 
37
37
  # Regexp for sharded tables.
38
38
  # A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date.
@@ -73,8 +73,10 @@ class BigQueryBaseConfig(ConfigModel):
73
73
  ) from e
74
74
  return v
75
75
 
76
- @root_validator(pre=True, skip_on_failure=True)
76
+ @root_validator(pre=True)
77
77
  def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
78
+ # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
79
+ values = deepcopy(values)
78
80
  project_id = values.pop("project_id", None)
79
81
  project_ids = values.get("project_ids")
80
82
 
@@ -105,64 +107,6 @@ class BigQueryUsageConfig(BaseUsageConfig):
105
107
  )
106
108
 
107
109
 
108
- class BigQueryConnectionConfig(ConfigModel):
109
- credential: Optional[GCPCredential] = Field(
110
- default=None, description="BigQuery credential informations"
111
- )
112
-
113
- _credentials_path: Optional[str] = PrivateAttr(None)
114
-
115
- extra_client_options: Dict[str, Any] = Field(
116
- default={},
117
- description="Additional options to pass to google.cloud.logging_v2.client.Client.",
118
- )
119
-
120
- project_on_behalf: Optional[str] = Field(
121
- default=None,
122
- description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
123
- )
124
-
125
- def __init__(self, **data: Any):
126
- super().__init__(**data)
127
-
128
- if self.credential:
129
- self._credentials_path = self.credential.create_credential_temp_file()
130
- logger.debug(
131
- f"Creating temporary credential file at {self._credentials_path}"
132
- )
133
- os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
134
-
135
- def get_bigquery_client(self) -> bigquery.Client:
136
- client_options = self.extra_client_options
137
- return bigquery.Client(self.project_on_behalf, **client_options)
138
-
139
- def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
140
- return resourcemanager_v3.ProjectsClient()
141
-
142
- def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
143
- return datacatalog_v1.PolicyTagManagerClient()
144
-
145
- def make_gcp_logging_client(
146
- self, project_id: Optional[str] = None
147
- ) -> GCPLoggingClient:
148
- # See https://github.com/googleapis/google-cloud-python/issues/2674 for
149
- # why we disable gRPC here.
150
- client_options = self.extra_client_options.copy()
151
- client_options["_use_grpc"] = False
152
- if project_id is not None:
153
- return GCPLoggingClient(**client_options, project=project_id)
154
- else:
155
- return GCPLoggingClient(**client_options)
156
-
157
- def get_sql_alchemy_url(self) -> str:
158
- if self.project_on_behalf:
159
- return f"bigquery://{self.project_on_behalf}"
160
- # When project_id is not set, we will attempt to detect the project ID
161
- # based on the credentials or environment variables.
162
- # See https://github.com/mxmzdlv/pybigquery#authentication.
163
- return "bigquery://"
164
-
165
-
166
110
  class GcsLineageProviderConfig(ConfigModel):
167
111
  """
168
112
  Any source that produces gcs lineage from/to Datasets should inherit this class.
@@ -240,13 +184,14 @@ class BigQueryFilterConfig(SQLFilterConfig):
240
184
  )
241
185
 
242
186
  # NOTE: `schema_pattern` is added here only to hide it from docs.
243
- schema_pattern: AllowDenyPattern = Field(
187
+ schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
244
188
  default=AllowDenyPattern.allow_all(),
245
- hidden_from_docs=True,
246
189
  )
247
190
 
248
191
  @root_validator(pre=False, skip_on_failure=True)
249
192
  def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
193
+ # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
194
+ values = deepcopy(values)
250
195
  dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
251
196
  schema_pattern = values.get("schema_pattern")
252
197
  if (
@@ -327,6 +272,7 @@ class BigQueryV2Config(
327
272
  SQLCommonConfig,
328
273
  StatefulUsageConfigMixin,
329
274
  StatefulLineageConfigMixin,
275
+ StatefulTimeWindowConfigMixin,
330
276
  StatefulProfilingConfigMixin,
331
277
  ClassificationSourceConfigMixin,
332
278
  ):
@@ -378,8 +324,7 @@ class BigQueryV2Config(
378
324
  description="Include full payload into events. It is only for debugging and internal use.",
379
325
  )
380
326
 
381
- number_of_datasets_process_in_batch: int = Field(
382
- hidden_from_docs=True,
327
+ number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(
383
328
  default=10000,
384
329
  description="Number of table queried in batch when getting metadata. This is a low level config property "
385
330
  "which should be touched with care.",
@@ -400,7 +345,7 @@ class BigQueryV2Config(
400
345
  )
401
346
 
402
347
  use_queries_v2: bool = Field(
403
- default=False,
348
+ default=True,
404
349
  description="If enabled, uses the new queries extractor to extract queries from bigquery.",
405
350
  )
406
351
  include_queries: bool = Field(
@@ -494,17 +439,15 @@ class BigQueryV2Config(
494
439
 
495
440
  upstream_lineage_in_report: bool = Field(
496
441
  default=False,
497
- description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.",
442
+ description="Useful for debugging lineage information. Set to True to see the raw lineage created internally. Only works with legacy approach (`use_queries_v2: False`).",
498
443
  )
499
444
 
500
- run_optimized_column_query: bool = Field(
501
- hidden_from_docs=True,
445
+ run_optimized_column_query: HiddenFromDocs[bool] = Field(
502
446
  default=False,
503
447
  description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.",
504
448
  )
505
449
 
506
- file_backed_cache_size: int = Field(
507
- hidden_from_docs=True,
450
+ file_backed_cache_size: HiddenFromDocs[int] = Field(
508
451
  default=2000,
509
452
  description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
510
453
  )
@@ -514,10 +457,9 @@ class BigQueryV2Config(
514
457
  description="Option to exclude empty projects from being ingested.",
515
458
  )
516
459
 
517
- schema_resolution_batch_size: int = Field(
460
+ schema_resolution_batch_size: HiddenFromDocs[int] = Field(
518
461
  default=100,
519
462
  description="The number of tables to process in a batch when resolving schema from DataHub.",
520
- hidden_from_schema=True,
521
463
  )
522
464
 
523
465
  max_threads_dataset_parallelism: int = Field(
@@ -538,6 +480,8 @@ class BigQueryV2Config(
538
480
 
539
481
  @root_validator(pre=True)
540
482
  def set_include_schema_metadata(cls, values: Dict) -> Dict:
483
+ # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
484
+ values = deepcopy(values)
541
485
  # Historically this is used to disable schema ingestion
542
486
  if (
543
487
  "include_tables" in values
@@ -556,6 +500,8 @@ class BigQueryV2Config(
556
500
 
557
501
  @root_validator(skip_on_failure=True)
558
502
  def profile_default_settings(cls, values: Dict) -> Dict:
503
+ # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
504
+ values = deepcopy(values)
559
505
  # Extra default SQLAlchemy option for better connection pooling and threading.
560
506
  # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
561
507
  values["options"].setdefault("max_overflow", -1)
@@ -573,9 +519,33 @@ class BigQueryV2Config(
573
519
 
574
520
  return v
575
521
 
522
+ @validator("upstream_lineage_in_report")
523
+ def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
524
+ if v and values.get("use_queries_v2", True):
525
+ logging.warning(
526
+ "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
527
+ "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
528
+ )
529
+
530
+ return v
531
+
532
+ @root_validator(pre=False, skip_on_failure=True)
533
+ def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
534
+ if values.get("use_queries_v2"):
535
+ if values.get("enable_stateful_lineage_ingestion") or values.get(
536
+ "enable_stateful_usage_ingestion"
537
+ ):
538
+ logger.warning(
539
+ "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
540
+ "when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
541
+ "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
542
+ "for the unified time window extraction (lineage + usage + operations + queries)."
543
+ )
544
+ return values
545
+
576
546
  def get_table_pattern(self, pattern: List[str]) -> str:
577
547
  return "|".join(pattern) if pattern else ""
578
548
 
579
- platform_instance_not_supported_for_bigquery = pydantic_removed_field(
549
+ _platform_instance_not_supported_for_bigquery = pydantic_removed_field(
580
550
  "platform_instance"
581
551
  )
@@ -0,0 +1,81 @@
1
+ import logging
2
+ import os
3
+ from typing import Any, Dict, Optional
4
+
5
+ from google.api_core.client_info import ClientInfo
6
+ from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
7
+ from google.cloud.logging_v2.client import Client as GCPLoggingClient
8
+ from pydantic import Field, PrivateAttr
9
+
10
+ from datahub._version import __version__
11
+ from datahub.configuration.common import ConfigModel
12
+ from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def _get_bigquery_client_info() -> ClientInfo:
18
+ """Get ClientInfo with DataHub user-agent for BigQuery client identification"""
19
+ return ClientInfo(user_agent=f"datahub/{__version__}")
20
+
21
+
22
+ class BigQueryConnectionConfig(ConfigModel):
23
+ credential: Optional[GCPCredential] = Field(
24
+ default=None, description="BigQuery credential informations"
25
+ )
26
+
27
+ _credentials_path: Optional[str] = PrivateAttr(None)
28
+
29
+ extra_client_options: Dict[str, Any] = Field(
30
+ default={},
31
+ description="Additional options to pass to google.cloud.logging_v2.client.Client.",
32
+ )
33
+
34
+ project_on_behalf: Optional[str] = Field(
35
+ default=None,
36
+ description="[Advanced] The BigQuery project in which queries are executed. Will be passed when creating a job. If not passed, falls back to the project associated with the service account.",
37
+ )
38
+
39
+ def __init__(self, **data: Any):
40
+ super().__init__(**data)
41
+
42
+ if self.credential:
43
+ self._credentials_path = self.credential.create_credential_temp_file()
44
+ logger.debug(
45
+ f"Creating temporary credential file at {self._credentials_path}"
46
+ )
47
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self._credentials_path
48
+
49
+ def get_bigquery_client(self) -> bigquery.Client:
50
+ client_options = self.extra_client_options
51
+ return bigquery.Client(
52
+ self.project_on_behalf,
53
+ client_info=_get_bigquery_client_info(),
54
+ **client_options,
55
+ )
56
+
57
+ def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
58
+ return resourcemanager_v3.ProjectsClient()
59
+
60
+ def get_policy_tag_manager_client(self) -> datacatalog_v1.PolicyTagManagerClient:
61
+ return datacatalog_v1.PolicyTagManagerClient()
62
+
63
+ def make_gcp_logging_client(
64
+ self, project_id: Optional[str] = None
65
+ ) -> GCPLoggingClient:
66
+ # See https://github.com/googleapis/google-cloud-python/issues/2674 for
67
+ # why we disable gRPC here.
68
+ client_options = self.extra_client_options.copy()
69
+ client_options["_use_grpc"] = False
70
+ if project_id is not None:
71
+ return GCPLoggingClient(**client_options, project=project_id)
72
+ else:
73
+ return GCPLoggingClient(**client_options)
74
+
75
+ def get_sql_alchemy_url(self) -> str:
76
+ if self.project_on_behalf:
77
+ return f"bigquery://{self.project_on_behalf}"
78
+ # When project_id is not set, we will attempt to detect the project ID
79
+ # based on the credentials or environment variables.
80
+ # See https://github.com/mxmzdlv/pybigquery#authentication.
81
+ return "bigquery://"
@@ -7,13 +7,16 @@ from typing_extensions import Self
7
7
 
8
8
  from datahub.configuration.time_window_config import BaseTimeWindowConfig
9
9
  from datahub.ingestion.api.common import PipelineContext
10
+ from datahub.ingestion.api.decorators import SupportStatus, support_status
10
11
  from datahub.ingestion.api.source import Source, SourceReport
11
12
  from datahub.ingestion.api.workunit import MetadataWorkUnit
12
13
  from datahub.ingestion.source.bigquery_v2.bigquery_config import (
13
- BigQueryConnectionConfig,
14
14
  BigQueryFilterConfig,
15
15
  BigQueryIdentifierConfig,
16
16
  )
17
+ from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
18
+ BigQueryConnectionConfig,
19
+ )
17
20
  from datahub.ingestion.source.bigquery_v2.bigquery_report import (
18
21
  BigQueryQueriesExtractorReport,
19
22
  BigQuerySchemaApiPerfReport,
@@ -48,6 +51,7 @@ class BigQueryQueriesSourceConfig(
48
51
  )
49
52
 
50
53
 
54
+ @support_status(SupportStatus.CERTIFIED)
51
55
  class BigQueryQueriesSource(Source):
52
56
  def __init__(self, ctx: PipelineContext, config: BigQueryQueriesSourceConfig):
53
57
  self.ctx = ctx
@@ -92,3 +96,4 @@ class BigQueryQueriesSource(Source):
92
96
  def close(self) -> None:
93
97
  self.queries_extractor.close()
94
98
  self.connection.close()
99
+ super().close()
@@ -9,7 +9,6 @@ import pydantic
9
9
  from datahub.ingestion.api.report import Report
10
10
  from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
11
11
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
12
- from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
13
12
  from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
14
13
  from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
15
14
  from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
@@ -78,7 +77,6 @@ class BigQueryQueriesExtractorReport(Report):
78
77
  @dataclass
79
78
  class BigQueryV2Report(
80
79
  SQLSourceReport,
81
- IngestionStageReport,
82
80
  BaseTimeWindowReport,
83
81
  ClassificationReportMixin,
84
82
  ):
@@ -283,23 +283,30 @@ class BigQuerySchemaApi:
283
283
  with self.report.list_datasets_timer:
284
284
  self.report.num_list_datasets_api_requests += 1
285
285
  datasets = self.bq_client.list_datasets(project_id, max_results=maxResults)
286
- return [
287
- BigqueryDataset(
288
- name=d.dataset_id,
289
- labels=d.labels,
290
- location=(
291
- d._properties.get("location")
292
- if hasattr(d, "_properties") and isinstance(d._properties, dict)
293
- else None
294
- ),
295
- # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
296
- # TODO: Given we are calling get_dataset for each dataset, we may consume and publish other fields too, such as created, modified, etc...
297
- # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
298
- # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
299
- comment=self.bq_client.get_dataset(d.reference).description,
286
+ result = []
287
+ for d in datasets:
288
+ # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
289
+ # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
290
+ # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
291
+ dataset = self.bq_client.get_dataset(d.reference)
292
+
293
+ location = (
294
+ d._properties.get("location")
295
+ if hasattr(d, "_properties") and isinstance(d._properties, dict)
296
+ else None
297
+ )
298
+
299
+ result.append(
300
+ BigqueryDataset(
301
+ name=d.dataset_id,
302
+ labels=d.labels,
303
+ location=location,
304
+ comment=dataset.description,
305
+ created=dataset.created,
306
+ last_altered=dataset.modified,
307
+ )
300
308
  )
301
- for d in datasets
302
- ]
309
+ return result
303
310
 
304
311
  # This is not used anywhere
305
312
  def get_datasets_for_project_id_with_information_schema(
@@ -12,6 +12,7 @@ from datahub.emitter.mce_builder import (
12
12
  make_dataset_urn_with_platform_instance,
13
13
  make_schema_field_urn,
14
14
  make_tag_urn,
15
+ make_ts_millis,
15
16
  )
16
17
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
17
18
  from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey
@@ -65,7 +66,7 @@ from datahub.ingestion.source.sql.sql_utils import (
65
66
  )
66
67
  from datahub.ingestion.source_report.ingestion_stage import (
67
68
  METADATA_EXTRACTION,
68
- PROFILING,
69
+ IngestionHighStage,
69
70
  )
70
71
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
71
72
  Status,
@@ -286,6 +287,7 @@ class BigQuerySchemaGenerator:
286
287
  yield from gen_database_container(
287
288
  database=database,
288
289
  name=database,
290
+ qualified_name=database,
289
291
  sub_types=[DatasetContainerSubTypes.BIGQUERY_PROJECT],
290
292
  domain_registry=self.domain_registry,
291
293
  domain_config=self.config.domain,
@@ -299,6 +301,8 @@ class BigQuerySchemaGenerator:
299
301
  description: Optional[str] = None,
300
302
  tags: Optional[Dict[str, str]] = None,
301
303
  extra_properties: Optional[Dict[str, str]] = None,
304
+ created: Optional[int] = None,
305
+ last_modified: Optional[int] = None,
302
306
  ) -> Iterable[MetadataWorkUnit]:
303
307
  schema_container_key = self.gen_dataset_key(project_id, dataset)
304
308
 
@@ -332,6 +336,7 @@ class BigQuerySchemaGenerator:
332
336
  yield from gen_schema_container(
333
337
  database=project_id,
334
338
  schema=dataset,
339
+ qualified_name=f"{project_id}.{dataset}",
335
340
  sub_types=[DatasetContainerSubTypes.BIGQUERY_DATASET],
336
341
  domain_registry=self.domain_registry,
337
342
  domain_config=self.config.domain,
@@ -347,6 +352,8 @@ class BigQuerySchemaGenerator:
347
352
  ),
348
353
  tags=tags_joined,
349
354
  extra_properties=extra_properties,
355
+ created=created,
356
+ last_modified=last_modified,
350
357
  )
351
358
 
352
359
  def _process_project(
@@ -409,7 +416,7 @@ class BigQuerySchemaGenerator:
409
416
 
410
417
  if self.config.is_profiling_enabled():
411
418
  logger.info(f"Starting profiling project {project_id}")
412
- with self.report.new_stage(f"{project_id}: {PROFILING}"):
419
+ with self.report.new_high_stage(IngestionHighStage.PROFILING):
413
420
  yield from self.profiler.get_workunits(
414
421
  project_id=project_id,
415
422
  tables=db_tables,
@@ -442,10 +449,12 @@ class BigQuerySchemaGenerator:
442
449
  ):
443
450
  yield wu
444
451
  except Exception as e:
445
- if self.config.is_profiling_enabled():
446
- action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission, bigquery.tables.getData permission?"
452
+ # If configuration indicates we need table data access (for profiling or use_tables_list_query_v2),
453
+ # include bigquery.tables.getData in the error message since that's likely the missing permission
454
+ if self.config.have_table_data_read_permission:
455
+ action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list, bigquery.tables.getData permissions?"
447
456
  else:
448
- action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission?"
457
+ action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permissions?"
449
458
 
450
459
  self.report.failure(
451
460
  title="Unable to get tables for dataset",
@@ -482,6 +491,12 @@ class BigQuerySchemaGenerator:
482
491
  else None
483
492
  ),
484
493
  description=bigquery_dataset.comment,
494
+ created=make_ts_millis(bigquery_dataset.created)
495
+ if bigquery_dataset.created
496
+ else None,
497
+ last_modified=make_ts_millis(bigquery_dataset.last_altered)
498
+ if bigquery_dataset.last_altered
499
+ else None,
485
500
  )
486
501
 
487
502
  columns = None
@@ -63,7 +63,7 @@ class BigQueryIdentifierBuilder:
63
63
  )
64
64
 
65
65
  def gen_user_urn(self, user_email: str) -> str:
66
- return make_user_urn(user_email.split("@")[0])
66
+ return make_user_urn(user_email)
67
67
 
68
68
  def make_data_platform_urn(self) -> str:
69
69
  return make_data_platform_urn(self.platform)
@@ -375,7 +375,7 @@ class BigqueryLineageExtractor:
375
375
  memory_footprint.total_size(lineage)
376
376
  )
377
377
 
378
- for lineage_key in lineage.keys():
378
+ for lineage_key in lineage:
379
379
  # For views, we do not use the upstreams obtained by parsing audit logs
380
380
  # as they may contain indirectly referenced tables.
381
381
  if (
@@ -189,6 +189,7 @@ WHERE
189
189
 
190
190
  if len(profile_requests) == 0:
191
191
  return
192
+
192
193
  yield from self.generate_profile_workunits(
193
194
  profile_requests,
194
195
  max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
226
227
  db_name, schema_name, bq_table, self.config.profiling.partition_datetime
227
228
  )
228
229
 
229
- if partition is None and bq_table.partition_info:
230
+ # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
231
+ if partition is None and bq_table.partition_info and bq_table.rows_count:
230
232
  self.report.report_warning(
231
233
  title="Profile skipped for partitioned table",
232
- message="profile skipped as partitioned table is empty or partition id or type was invalid",
234
+ message="profile skipped as partition id or type was invalid",
233
235
  context=profile_request.pretty_name,
234
236
  )
235
237
  return None
@@ -45,12 +45,12 @@ SELECT
45
45
  tos.OPTION_VALUE as comment,
46
46
  t.is_insertable_into,
47
47
  t.ddl,
48
- ts.row_count,
48
+ ts.row_count as row_count,
49
49
  ts.size_bytes as bytes,
50
50
  p.num_partitions,
51
51
  p.max_partition_id,
52
- p.active_billable_bytes,
53
- p.long_term_billable_bytes,
52
+ p.active_billable_bytes as active_billable_bytes,
53
+ IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
54
54
  REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
55
55
  REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
56
56