acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/bigquery_v2/bigquery_config.py

@@ -1,12 +1,13 @@
 import logging
-import os
 import re
+from copy import deepcopy
 from datetime import timedelta
 from typing import Dict, List, Optional, Union
 
 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
+from datahub.configuration.env_vars import get_bigquery_schema_parallelism
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -24,15 +25,14 @@ from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, SQLFilterConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulLineageConfigMixin,
     StatefulProfilingConfigMixin,
+    StatefulTimeWindowConfigMixin,
     StatefulUsageConfigMixin,
 )
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_BQ_SCHEMA_PARALLELISM = int(
-    os.getenv("DATAHUB_BIGQUERY_SCHEMA_PARALLELISM", 20)
-)
+DEFAULT_BQ_SCHEMA_PARALLELISM = get_bigquery_schema_parallelism()
 
 # Regexp for sharded tables.
 # A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date.
@@ -73,8 +73,10 @@ class BigQueryBaseConfig(ConfigModel):
             ) from e
         return v
 
-    @root_validator(pre=True, skip_on_failure=True)
+    @root_validator(pre=True)
     def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
        project_id = values.pop("project_id", None)
        project_ids = values.get("project_ids")
 
@@ -182,13 +184,14 @@ class BigQueryFilterConfig(SQLFilterConfig):
     )
 
     # NOTE: `schema_pattern` is added here only to hide it from docs.
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
     )
 
     @root_validator(pre=False, skip_on_failure=True)
     def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
        dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
        schema_pattern = values.get("schema_pattern")
        if (
@@ -269,6 +272,7 @@ class BigQueryV2Config(
     SQLCommonConfig,
     StatefulUsageConfigMixin,
     StatefulLineageConfigMixin,
+    StatefulTimeWindowConfigMixin,
     StatefulProfilingConfigMixin,
     ClassificationSourceConfigMixin,
 ):
@@ -320,8 +324,7 @@ class BigQueryV2Config(
         description="Include full payload into events. It is only for debugging and internal use.",
     )
 
-    number_of_datasets_process_in_batch: int = Field(
-        hidden_from_docs=True,
+    number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(
         default=10000,
         description="Number of table queried in batch when getting metadata. This is a low level config property "
         "which should be touched with care.",
@@ -342,7 +345,7 @@ class BigQueryV2Config(
     )
 
     use_queries_v2: bool = Field(
-        default=False,
+        default=True,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
     include_queries: bool = Field(
@@ -436,17 +439,15 @@ class BigQueryV2Config(
 
     upstream_lineage_in_report: bool = Field(
         default=False,
-        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.",
+        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally. Only works with legacy approach (`use_queries_v2: False`).",
     )
 
-    run_optimized_column_query: bool = Field(
-        hidden_from_docs=True,
+    run_optimized_column_query: HiddenFromDocs[bool] = Field(
         default=False,
         description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.",
     )
 
-    file_backed_cache_size: int = Field(
-        hidden_from_docs=True,
+    file_backed_cache_size: HiddenFromDocs[int] = Field(
         default=2000,
         description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
     )
@@ -456,10 +457,9 @@ class BigQueryV2Config(
         description="Option to exclude empty projects from being ingested.",
     )
 
-    schema_resolution_batch_size: int = Field(
+    schema_resolution_batch_size: HiddenFromDocs[int] = Field(
         default=100,
         description="The number of tables to process in a batch when resolving schema from DataHub.",
-        hidden_from_schema=True,
     )
 
     max_threads_dataset_parallelism: int = Field(
@@ -480,6 +480,8 @@ class BigQueryV2Config(
 
     @root_validator(pre=True)
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
        # Historically this is used to disable schema ingestion
        if (
            "include_tables" in values
@@ -498,6 +500,8 @@ class BigQueryV2Config(
 
     @root_validator(skip_on_failure=True)
     def profile_default_settings(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
        # Extra default SQLAlchemy option for better connection pooling and threading.
        # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
        values["options"].setdefault("max_overflow", -1)
@@ -515,9 +519,33 @@ class BigQueryV2Config(
 
         return v
 
+    @validator("upstream_lineage_in_report")
+    def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
+        if v and values.get("use_queries_v2", True):
+            logging.warning(
+                "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
+                "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
+            )
+
+        return v
+
+    @root_validator(pre=False, skip_on_failure=True)
+    def validate_queries_v2_stateful_ingestion(cls, values: Dict) -> Dict:
+        if values.get("use_queries_v2"):
+            if values.get("enable_stateful_lineage_ingestion") or values.get(
+                "enable_stateful_usage_ingestion"
+            ):
+                logger.warning(
+                    "enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion are deprecated "
+                    "when using use_queries_v2=True. These configs only work with the legacy (non-queries v2) extraction path. "
+                    "For queries v2, use enable_stateful_time_window instead to enable stateful ingestion "
+                    "for the unified time window extraction (lineage + usage + operations + queries)."
+                )
+        return values
+
     def get_table_pattern(self, pattern: List[str]) -> str:
         return "|".join(pattern) if pattern else ""
 
-    platform_instance_not_supported_for_bigquery = pydantic_removed_field(
+    _platform_instance_not_supported_for_bigquery = pydantic_removed_field(
         "platform_instance"
     )
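
Several of the validators above now begin with `values = deepcopy(values)`. A minimal sketch (plain Python, no pydantic) of the failure mode this guards against: a `pre=True` root validator receives the caller's dict, so popping keys in place leaks the mutation back into any config dict that is reused, for example across tests. The `project_id_compat` helper below is hypothetical, written only to illustrate the pattern.

from copy import deepcopy
from typing import Dict


def project_id_compat(values: Dict, copy_first: bool) -> Dict:
    # Hypothetical validator body: rewrites legacy project_id -> project_ids.
    if copy_first:
        values = deepcopy(values)  # work on a private copy
    project_id = values.pop("project_id", None)  # mutates in place otherwise
    if project_id and not values.get("project_ids"):
        values["project_ids"] = [project_id]
    return values


cfg = {"project_id": "my-project"}
project_id_compat(cfg, copy_first=False)
assert "project_id" not in cfg  # the caller's dict was silently mutated

cfg = {"project_id": "my-project"}
project_id_compat(cfg, copy_first=True)
assert cfg == {"project_id": "my-project"}  # deepcopy leaves it untouched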
datahub/ingestion/source/bigquery_v2/bigquery_connection.py

@@ -2,16 +2,23 @@ import logging
 import os
 from typing import Any, Dict, Optional
 
+from google.api_core.client_info import ClientInfo
 from google.cloud import bigquery, datacatalog_v1, resourcemanager_v3
 from google.cloud.logging_v2.client import Client as GCPLoggingClient
 from pydantic import Field, PrivateAttr
 
+from datahub._version import __version__
 from datahub.configuration.common import ConfigModel
 from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
 
 logger = logging.getLogger(__name__)
 
 
+def _get_bigquery_client_info() -> ClientInfo:
+    """Get ClientInfo with DataHub user-agent for BigQuery client identification"""
+    return ClientInfo(user_agent=f"datahub/{__version__}")
+
+
 class BigQueryConnectionConfig(ConfigModel):
     credential: Optional[GCPCredential] = Field(
         default=None, description="BigQuery credential informations"
@@ -41,7 +48,11 @@ class BigQueryConnectionConfig(ConfigModel):
 
     def get_bigquery_client(self) -> bigquery.Client:
         client_options = self.extra_client_options
-        return bigquery.Client(self.project_on_behalf, **client_options)
+        return bigquery.Client(
+            self.project_on_behalf,
+            client_info=_get_bigquery_client_info(),
+            **client_options,
+        )
 
     def get_projects_client(self) -> resourcemanager_v3.ProjectsClient:
         return resourcemanager_v3.ProjectsClient()
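
For context on the `client_info` change above: `google.api_core.client_info.ClientInfo` carries a `user_agent` string that the client library includes in outgoing request headers, which lets server-side audit logs attribute traffic to DataHub. A minimal sketch, assuming application-default credentials are configured; the project id and version string are placeholders:

from google.api_core.client_info import ClientInfo
from google.cloud import bigquery

# "my-project" is a placeholder; the user agent mirrors _get_bigquery_client_info().
client = bigquery.Client(
    "my-project",
    client_info=ClientInfo(user_agent="datahub/1.3.0"),
)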
datahub/ingestion/source/bigquery_v2/bigquery_queries.py

@@ -7,6 +7,7 @@ from typing_extensions import Self
 
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import SupportStatus, support_status
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.bigquery_v2.bigquery_config import (
@@ -50,6 +51,7 @@ class BigQueryQueriesSourceConfig(
     )
 
 
+@support_status(SupportStatus.CERTIFIED)
 class BigQueryQueriesSource(Source):
     def __init__(self, ctx: PipelineContext, config: BigQueryQueriesSourceConfig):
         self.ctx = ctx
@@ -94,3 +96,4 @@ class BigQueryQueriesSource(Source):
     def close(self) -> None:
         self.queries_extractor.close()
         self.connection.close()
+        super().close()
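
The added `super().close()` matters whenever the base class releases its own resources in `close()`. A minimal sketch (toy classes, not DataHub's actual `Source`) of why an overriding `close()` should chain to the base class:

class Base:
    def __init__(self) -> None:
        self._resources = ["report-stream"]

    def close(self) -> None:
        self._resources.clear()


class MySource(Base):
    def close(self) -> None:
        # ... close source-specific connections first ...
        super().close()  # then let the base class release its own state


source = MySource()
source.close()
assert source._resources == []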
datahub/ingestion/source/bigquery_v2/bigquery_report.py

@@ -9,7 +9,6 @@ import pydantic
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
 from datahub.ingestion.source.sql.sql_report import SQLSourceReport
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
 from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
 from datahub.utilities.lossy_collections import LossyDict, LossyList, LossySet
@@ -78,7 +77,6 @@ class BigQueryQueriesExtractorReport(Report):
 @dataclass
 class BigQueryV2Report(
     SQLSourceReport,
-    IngestionStageReport,
     BaseTimeWindowReport,
     ClassificationReportMixin,
 ):
datahub/ingestion/source/bigquery_v2/bigquery_schema.py

@@ -283,23 +283,30 @@ class BigQuerySchemaApi:
         with self.report.list_datasets_timer:
             self.report.num_list_datasets_api_requests += 1
             datasets = self.bq_client.list_datasets(project_id, max_results=maxResults)
-        return [
-            BigqueryDataset(
-                name=d.dataset_id,
-                labels=d.labels,
-                location=(
-                    d._properties.get("location")
-                    if hasattr(d, "_properties") and isinstance(d._properties, dict)
-                    else None
-                ),
-                # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
-                # TODO: Given we are calling get_dataset for each dataset, we may consume and publish other fields too, such as created, modified, etc...
-                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
-                # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
-                comment=self.bq_client.get_dataset(d.reference).description,
+        result = []
+        for d in datasets:
+            # TODO: Fetch dataset description individually impacts overall performance if the number of datasets is high (hundreds); instead we should fetch in batch for all datasets.
+            # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client#google_cloud_bigquery_client_Client_get_dataset
+            # https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.dataset.Dataset
+            dataset = self.bq_client.get_dataset(d.reference)
+
+            location = (
+                d._properties.get("location")
+                if hasattr(d, "_properties") and isinstance(d._properties, dict)
+                else None
+            )
+
+            result.append(
+                BigqueryDataset(
+                    name=d.dataset_id,
+                    labels=d.labels,
+                    location=location,
+                    comment=dataset.description,
+                    created=dataset.created,
+                    last_altered=dataset.modified,
+                )
             )
-            for d in datasets
-        ]
+        return result
 
     # This is not used anywhere
     def get_datasets_for_project_id_with_information_schema(
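
The rewrite above still calls `get_dataset()` once per dataset, but now reuses the returned object for description, created, and modified instead of discarding everything but the description. A minimal standalone sketch of the same pattern against the google-cloud-bigquery API (assumes application-default credentials; "my-project" is a placeholder):

from google.cloud import bigquery

client = bigquery.Client()  # assumes application-default credentials
for item in client.list_datasets("my-project"):  # "my-project" is a placeholder
    dataset = client.get_dataset(item.reference)  # one API call per dataset...
    # ...then read every needed field from the same response object
    print(item.dataset_id, dataset.description, dataset.created, dataset.modified)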
datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py

@@ -12,6 +12,7 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn_with_platform_instance,
     make_schema_field_urn,
     make_tag_urn,
+    make_ts_millis,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey
@@ -65,7 +66,7 @@ from datahub.ingestion.source.sql.sql_utils import (
 )
 from datahub.ingestion.source_report.ingestion_stage import (
     METADATA_EXTRACTION,
-    PROFILING,
+    IngestionHighStage,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     Status,
@@ -286,6 +287,7 @@ class BigQuerySchemaGenerator:
         yield from gen_database_container(
             database=database,
             name=database,
+            qualified_name=database,
             sub_types=[DatasetContainerSubTypes.BIGQUERY_PROJECT],
             domain_registry=self.domain_registry,
             domain_config=self.config.domain,
@@ -299,6 +301,8 @@ class BigQuerySchemaGenerator:
         description: Optional[str] = None,
         tags: Optional[Dict[str, str]] = None,
         extra_properties: Optional[Dict[str, str]] = None,
+        created: Optional[int] = None,
+        last_modified: Optional[int] = None,
     ) -> Iterable[MetadataWorkUnit]:
         schema_container_key = self.gen_dataset_key(project_id, dataset)
 
@@ -332,6 +336,7 @@ class BigQuerySchemaGenerator:
         yield from gen_schema_container(
             database=project_id,
             schema=dataset,
+            qualified_name=f"{project_id}.{dataset}",
             sub_types=[DatasetContainerSubTypes.BIGQUERY_DATASET],
             domain_registry=self.domain_registry,
             domain_config=self.config.domain,
@@ -347,6 +352,8 @@ class BigQuerySchemaGenerator:
             ),
             tags=tags_joined,
             extra_properties=extra_properties,
+            created=created,
+            last_modified=last_modified,
         )
 
     def _process_project(
@@ -409,7 +416,7 @@ class BigQuerySchemaGenerator:
 
         if self.config.is_profiling_enabled():
             logger.info(f"Starting profiling project {project_id}")
-            with self.report.new_stage(f"{project_id}: {PROFILING}"):
+            with self.report.new_high_stage(IngestionHighStage.PROFILING):
                 yield from self.profiler.get_workunits(
                     project_id=project_id,
                     tables=db_tables,
@@ -442,10 +449,12 @@ class BigQuerySchemaGenerator:
         ):
             yield wu
         except Exception as e:
-            if self.config.is_profiling_enabled():
-                action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission, bigquery.tables.getData permission?"
+            # If configuration indicates we need table data access (for profiling or use_tables_list_query_v2),
+            # include bigquery.tables.getData in the error message since that's likely the missing permission
+            if self.config.have_table_data_read_permission:
+                action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list, bigquery.tables.getData permissions?"
             else:
-                action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission?"
+                action_mesage = "Does your service account have bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permissions?"
 
             self.report.failure(
                 title="Unable to get tables for dataset",
@@ -482,6 +491,12 @@ class BigQuerySchemaGenerator:
                 else None
             ),
             description=bigquery_dataset.comment,
+            created=make_ts_millis(bigquery_dataset.created)
+            if bigquery_dataset.created
+            else None,
+            last_modified=make_ts_millis(bigquery_dataset.last_altered)
+            if bigquery_dataset.last_altered
+            else None,
        )
 
        columns = None
datahub/ingestion/source/bigquery_v2/common.py

@@ -63,7 +63,7 @@ class BigQueryIdentifierBuilder:
         )
 
     def gen_user_urn(self, user_email: str) -> str:
-        return make_user_urn(user_email.split("@")[0])
+        return make_user_urn(user_email)
 
     def make_data_platform_urn(self) -> str:
         return make_data_platform_urn(self.platform)
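
The one-line change above alters every BigQuery user urn: previously only the local part of the email address was used, now the full address is. A quick illustration with DataHub's `make_user_urn`:

from datahub.emitter.mce_builder import make_user_urn

email = "jdoe@example.com"  # placeholder address
print(make_user_urn(email.split("@")[0]))  # old behavior: urn:li:corpuser:jdoe
print(make_user_urn(email))                # new behavior: urn:li:corpuser:jdoe@example.com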
datahub/ingestion/source/bigquery_v2/profiler.py

@@ -189,6 +189,7 @@ WHERE
 
         if len(profile_requests) == 0:
             return
+
         yield from self.generate_profile_workunits(
             profile_requests,
             max_workers=self.config.profiling.max_workers,
@@ -226,10 +227,11 @@ WHERE
             db_name, schema_name, bq_table, self.config.profiling.partition_datetime
         )
 
-        if partition is None and bq_table.partition_info:
+        # For partitioned tables, if it has a row count but not a valid partition, that means something went wrong with the partition detection.
+        if partition is None and bq_table.partition_info and bq_table.rows_count:
             self.report.report_warning(
                 title="Profile skipped for partitioned table",
-                message="profile skipped as partitioned table is empty or partition id or type was invalid",
+                message="profile skipped as partition id or type was invalid",
                 context=profile_request.pretty_name,
             )
             return None
datahub/ingestion/source/bigquery_v2/queries.py

@@ -45,12 +45,12 @@ SELECT
     tos.OPTION_VALUE as comment,
     t.is_insertable_into,
     t.ddl,
-    ts.row_count,
+    ts.row_count as row_count,
     ts.size_bytes as bytes,
     p.num_partitions,
     p.max_partition_id,
-    p.active_billable_bytes,
-    p.long_term_billable_bytes,
+    p.active_billable_bytes as active_billable_bytes,
+    IFNULL(p.long_term_billable_bytes, 0) as long_term_billable_bytes,
     REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix,
     REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base
 
datahub/ingestion/source/bigquery_v2/queries_extractor.py

@@ -8,7 +8,7 @@ from typing import Collection, Dict, Iterable, List, Optional, TypedDict
 from google.cloud.bigquery import Client
 from pydantic import Field, PositiveInt
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     get_time_bucket,
@@ -36,6 +36,9 @@ from datahub.ingestion.source.bigquery_v2.common import (
     BigQueryFilter,
     BigQueryIdentifierBuilder,
 )
+from datahub.ingestion.source.state.redundant_run_skip_handler import (
+    RedundantQueriesRunSkipHandler,
+)
 from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
 from datahub.metadata.urns import CorpUserUrn
 from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -86,12 +89,11 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
     # TODO: Support stateful ingestion for the time windows.
     window: BaseTimeWindowConfig = BaseTimeWindowConfig()
 
-    local_temp_path: Optional[pathlib.Path] = Field(
-        default=None,
-        description="Local path to store the audit log.",
+    local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = Field(
         # TODO: For now, this is simply an advanced config to make local testing easier.
         # Eventually, we will want to store date-specific files in the directory and use it as a cache.
-        hidden_from_docs=True,
+        default=None,
+        description="Local path to store the audit log.",
     )
 
     user_email_pattern: AllowDenyPattern = Field(
@@ -136,6 +138,7 @@ class BigQueryQueriesExtractor(Closeable):
         structured_report: SourceReport,
         filters: BigQueryFilter,
         identifiers: BigQueryIdentifierBuilder,
+        redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
         graph: Optional[DataHubGraph] = None,
         schema_resolver: Optional[SchemaResolver] = None,
         discovered_tables: Optional[Collection[str]] = None,
@@ -159,6 +162,9 @@ class BigQueryQueriesExtractor(Closeable):
         )
 
         self.structured_report = structured_report
+        self.redundant_run_skip_handler = redundant_run_skip_handler
+
+        self.start_time, self.end_time = self._get_time_window()
 
         self.aggregator = SqlParsingAggregator(
             platform=self.identifiers.platform,
@@ -173,8 +179,8 @@ class BigQueryQueriesExtractor(Closeable):
             generate_query_usage_statistics=self.config.include_query_usage_statistics,
             usage_config=BaseUsageConfig(
                 bucket_duration=self.config.window.bucket_duration,
-                start_time=self.config.window.start_time,
-                end_time=self.config.window.end_time,
+                start_time=self.start_time,
+                end_time=self.end_time,
                 user_email_pattern=self.config.user_email_pattern,
                 top_n_queries=self.config.top_n_queries,
             ),
@@ -200,6 +206,34 @@ class BigQueryQueriesExtractor(Closeable):
         logger.info(f"Using local temp path: {path}")
         return path
 
+    def _get_time_window(self) -> tuple[datetime, datetime]:
+        if self.redundant_run_skip_handler:
+            start_time, end_time = (
+                self.redundant_run_skip_handler.suggest_run_time_window(
+                    self.config.window.start_time,
+                    self.config.window.end_time,
+                )
+            )
+        else:
+            start_time = self.config.window.start_time
+            end_time = self.config.window.end_time
+
+        # Usage statistics are aggregated per bucket (typically per day).
+        # To ensure accurate aggregated metrics, we need to align the start_time
+        # to the beginning of a bucket so that we include complete bucket periods.
+        if self.config.include_usage_statistics:
+            start_time = get_time_bucket(start_time, self.config.window.bucket_duration)
+
+        return start_time, end_time
+
+    def _update_state(self) -> None:
+        if self.redundant_run_skip_handler:
+            self.redundant_run_skip_handler.update_state(
+                self.config.window.start_time,
+                self.config.window.end_time,
+                self.config.window.bucket_duration,
+            )
+
     def is_temp_table(self, name: str) -> bool:
         try:
             table = BigqueryTableIdentifier.from_string_name(name)
@@ -300,6 +334,8 @@ class BigQueryQueriesExtractor(Closeable):
             shared_connection.close()
             audit_log_file.unlink(missing_ok=True)
 
+        self._update_state()
+
     def deduplicate_queries(
         self, queries: FileBackedList[ObservedQuery]
     ) -> FileBackedDict[Dict[int, ObservedQuery]]:
@@ -356,8 +392,8 @@ class BigQueryQueriesExtractor(Closeable):
         query_log_query = _build_enriched_query_log_query(
             project_id=project.id,
             region=region,
-            start_time=self.config.window.start_time,
-            end_time=self.config.window.end_time,
+            start_time=self.start_time,
+            end_time=self.end_time,
         )
 
         logger.info(f"Fetching query log from BQ Project {project.id} for {region}")
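
The `_get_time_window` logic above aligns `start_time` to a bucket boundary before computing usage, so that per-bucket aggregates cover complete buckets. A minimal illustrative stand-in for that alignment with a daily bucket (this is not DataHub's actual `get_time_bucket` implementation, just the idea):

from datetime import datetime, timezone


def align_to_day_bucket(ts: datetime) -> datetime:
    # Truncate to the start of the day-sized bucket containing ts.
    return ts.replace(hour=0, minute=0, second=0, microsecond=0)


start = datetime(2024, 5, 3, 14, 25, tzinfo=timezone.utc)
print(align_to_day_bucket(start))  # 2024-05-03 00:00:00+00:00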
datahub/ingestion/source/cassandra/cassandra.py

@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class CassandraSource(StatefulIngestionSourceBase):
@@ -296,13 +296,11 @@ class CassandraSource(StatefulIngestionSourceBase):
             qualified_name=dataset_name,
             description=view.comment,
             custom_properties=self._get_dataset_custom_props(view),
-            extra_aspects=[
-                ViewPropertiesClass(
-                    materialized=True,
-                    viewLogic=view.where_clause,  # Use the WHERE clause as view logic
-                    viewLanguage="CQL",  # Use "CQL" as the language
-                ),
-            ],
+            view_definition=ViewPropertiesClass(
+                materialized=True,
+                viewLogic=view.where_clause,  # Use the WHERE clause as view logic
+                viewLanguage="CQL",  # Use "CQL" as the language
+            ),
         )
 
         # Construct and emit lineage off of 'base_table_name'
datahub/ingestion/source/cassandra/cassandra_api.py

@@ -132,7 +132,23 @@ class CassandraAPI:
 
         ssl_context = None
         if self.config.ssl_ca_certs:
-            ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+            # Map SSL version string to ssl module constant
+            ssl_version_map = {
+                "TLS_CLIENT": ssl.PROTOCOL_TLS_CLIENT,
+                "TLSv1": ssl.PROTOCOL_TLSv1,
+                "TLSv1_1": ssl.PROTOCOL_TLSv1_1,
+                "TLSv1_2": ssl.PROTOCOL_TLSv1_2,
+                "TLSv1_3": ssl.PROTOCOL_TLSv1_2,  # Python's ssl module uses TLSv1_2 for TLS 1.3
+            }
+
+            ssl_protocol = (
+                ssl_version_map.get(
+                    self.config.ssl_version, ssl.PROTOCOL_TLS_CLIENT
+                )
+                if self.config.ssl_version
+                else ssl.PROTOCOL_TLS_CLIENT
+            )
+            ssl_context = ssl.SSLContext(ssl_protocol)
             ssl_context.load_verify_locations(self.config.ssl_ca_certs)
             if self.config.ssl_certfile and self.config.ssl_keyfile:
                 ssl_context.load_cert_chain(
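
A standalone sketch of the version mapping introduced above, runnable outside the Cassandra source. Note that Python's `ssl` module has no `PROTOCOL_TLSv1_3` constant, which is why the source maps "TLSv1_3" onto `PROTOCOL_TLSv1_2`; `PROTOCOL_TLS_CLIENT` negotiates the highest version both peers support:

import ssl
from typing import Optional

SSL_VERSION_MAP = {
    "TLS_CLIENT": ssl.PROTOCOL_TLS_CLIENT,
    "TLSv1_2": ssl.PROTOCOL_TLSv1_2,  # abridged; the source maps more names
}


def make_context(ssl_version: Optional[str]) -> ssl.SSLContext:
    # Fall back to PROTOCOL_TLS_CLIENT for unknown or unset versions.
    protocol = SSL_VERSION_MAP.get(ssl_version or "TLS_CLIENT", ssl.PROTOCOL_TLS_CLIENT)
    return ssl.SSLContext(protocol)


ctx = make_context("TLSv1_2")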
datahub/ingestion/source/cassandra/cassandra_config.py

@@ -94,6 +94,11 @@ class CassandraSourceConfig(
         description="Path to the SSL key file for SSL connections.",
     )
 
+    ssl_version: Optional[str] = Field(
+        default="TLS_CLIENT",
+        description="SSL protocol version to use for connections. Options: TLS_CLIENT, TLSv1, TLSv1_1, TLSv1_2, TLSv1_3. Defaults to TLS_CLIENT.",
+    )
+
     keyspace_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns to filter keyspaces for ingestion.",
datahub/ingestion/source/cassandra/cassandra_profiling.py

@@ -18,7 +18,7 @@ from datahub.ingestion.source.cassandra.cassandra_api import (
 )
 from datahub.ingestion.source.cassandra.cassandra_config import CassandraSourceConfig
 from datahub.ingestion.source.cassandra.cassandra_utils import CassandraSourceReport
-from datahub.ingestion.source_report.ingestion_stage import PROFILING
+from datahub.ingestion.source_report.ingestion_stage import IngestionHighStage
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
@@ -70,11 +70,12 @@ class CassandraProfiler:
     ) -> Iterable[MetadataWorkUnit]:
         for keyspace_name in cassandra_data.keyspaces:
             tables = cassandra_data.tables.get(keyspace_name, [])
-            with self.report.new_stage(
-                f"{keyspace_name}: {PROFILING}"
-            ), ThreadPoolExecutor(
-                max_workers=self.config.profiling.max_workers
-            ) as executor:
+            with (
+                self.report.new_high_stage(IngestionHighStage.PROFILING),
+                ThreadPoolExecutor(
+                    max_workers=self.config.profiling.max_workers
+                ) as executor,
+            ):
                 future_to_dataset = {
                     executor.submit(
                         self.generate_profile,
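
The refactor above switches to parenthesized context managers, which are officially supported from Python 3.10. A minimal example of the same construct, with `nullcontext` standing in for the report stage:

from concurrent.futures import ThreadPoolExecutor
from contextlib import nullcontext

with (
    nullcontext(),  # stand-in for report.new_high_stage(...)
    ThreadPoolExecutor(max_workers=2) as executor,
):
    future = executor.submit(sum, [1, 2, 3])
    assert future.result() == 6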
datahub/ingestion/source/cassandra/cassandra_utils.py

@@ -6,7 +6,6 @@ from datahub.ingestion.source.cassandra.cassandra_api import CassandraColumn
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
-from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaField,
     SchemaFieldDataType,
@@ -35,7 +34,7 @@ SYSTEM_KEYSPACE_LIST = set(
 
 
 @dataclass
-class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport):
+class CassandraSourceReport(StaleEntityRemovalSourceReport):
     num_tables_failed: int = 0
     num_views_failed: int = 0
     tables_scanned: int = 0