acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503) hide show
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,7 @@ from typing import Collection, Dict, Iterable, List, Optional, TypedDict
8
8
  from google.cloud.bigquery import Client
9
9
  from pydantic import Field, PositiveInt
10
10
 
11
- from datahub.configuration.common import AllowDenyPattern
11
+ from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
12
12
  from datahub.configuration.time_window_config import (
13
13
  BaseTimeWindowConfig,
14
14
  get_time_bucket,
@@ -36,6 +36,9 @@ from datahub.ingestion.source.bigquery_v2.common import (
36
36
  BigQueryFilter,
37
37
  BigQueryIdentifierBuilder,
38
38
  )
39
+ from datahub.ingestion.source.state.redundant_run_skip_handler import (
40
+ RedundantQueriesRunSkipHandler,
41
+ )
39
42
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
40
43
  from datahub.metadata.urns import CorpUserUrn
41
44
  from datahub.sql_parsing.schema_resolver import SchemaResolver
@@ -86,12 +89,11 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
86
89
  # TODO: Support stateful ingestion for the time windows.
87
90
  window: BaseTimeWindowConfig = BaseTimeWindowConfig()
88
91
 
89
- local_temp_path: Optional[pathlib.Path] = Field(
90
- default=None,
91
- description="Local path to store the audit log.",
92
+ local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = Field(
92
93
  # TODO: For now, this is simply an advanced config to make local testing easier.
93
94
  # Eventually, we will want to store date-specific files in the directory and use it as a cache.
94
- hidden_from_docs=True,
95
+ default=None,
96
+ description="Local path to store the audit log.",
95
97
  )
96
98
 
97
99
  user_email_pattern: AllowDenyPattern = Field(
@@ -136,6 +138,7 @@ class BigQueryQueriesExtractor(Closeable):
136
138
  structured_report: SourceReport,
137
139
  filters: BigQueryFilter,
138
140
  identifiers: BigQueryIdentifierBuilder,
141
+ redundant_run_skip_handler: Optional[RedundantQueriesRunSkipHandler] = None,
139
142
  graph: Optional[DataHubGraph] = None,
140
143
  schema_resolver: Optional[SchemaResolver] = None,
141
144
  discovered_tables: Optional[Collection[str]] = None,
@@ -159,6 +162,9 @@ class BigQueryQueriesExtractor(Closeable):
159
162
  )
160
163
 
161
164
  self.structured_report = structured_report
165
+ self.redundant_run_skip_handler = redundant_run_skip_handler
166
+
167
+ self.start_time, self.end_time = self._get_time_window()
162
168
 
163
169
  self.aggregator = SqlParsingAggregator(
164
170
  platform=self.identifiers.platform,
@@ -173,8 +179,8 @@ class BigQueryQueriesExtractor(Closeable):
173
179
  generate_query_usage_statistics=self.config.include_query_usage_statistics,
174
180
  usage_config=BaseUsageConfig(
175
181
  bucket_duration=self.config.window.bucket_duration,
176
- start_time=self.config.window.start_time,
177
- end_time=self.config.window.end_time,
182
+ start_time=self.start_time,
183
+ end_time=self.end_time,
178
184
  user_email_pattern=self.config.user_email_pattern,
179
185
  top_n_queries=self.config.top_n_queries,
180
186
  ),
@@ -200,6 +206,34 @@ class BigQueryQueriesExtractor(Closeable):
200
206
  logger.info(f"Using local temp path: {path}")
201
207
  return path
202
208
 
209
+ def _get_time_window(self) -> tuple[datetime, datetime]:
210
+ if self.redundant_run_skip_handler:
211
+ start_time, end_time = (
212
+ self.redundant_run_skip_handler.suggest_run_time_window(
213
+ self.config.window.start_time,
214
+ self.config.window.end_time,
215
+ )
216
+ )
217
+ else:
218
+ start_time = self.config.window.start_time
219
+ end_time = self.config.window.end_time
220
+
221
+ # Usage statistics are aggregated per bucket (typically per day).
222
+ # To ensure accurate aggregated metrics, we need to align the start_time
223
+ # to the beginning of a bucket so that we include complete bucket periods.
224
+ if self.config.include_usage_statistics:
225
+ start_time = get_time_bucket(start_time, self.config.window.bucket_duration)
226
+
227
+ return start_time, end_time
228
+
229
+ def _update_state(self) -> None:
230
+ if self.redundant_run_skip_handler:
231
+ self.redundant_run_skip_handler.update_state(
232
+ self.config.window.start_time,
233
+ self.config.window.end_time,
234
+ self.config.window.bucket_duration,
235
+ )
236
+
203
237
  def is_temp_table(self, name: str) -> bool:
204
238
  try:
205
239
  table = BigqueryTableIdentifier.from_string_name(name)
@@ -300,6 +334,8 @@ class BigQueryQueriesExtractor(Closeable):
300
334
  shared_connection.close()
301
335
  audit_log_file.unlink(missing_ok=True)
302
336
 
337
+ self._update_state()
338
+
303
339
  def deduplicate_queries(
304
340
  self, queries: FileBackedList[ObservedQuery]
305
341
  ) -> FileBackedDict[Dict[int, ObservedQuery]]:
@@ -356,8 +392,8 @@ class BigQueryQueriesExtractor(Closeable):
356
392
  query_log_query = _build_enriched_query_log_query(
357
393
  project_id=project.id,
358
394
  region=region,
359
- start_time=self.config.window.start_time,
360
- end_time=self.config.window.end_time,
395
+ start_time=self.start_time,
396
+ end_time=self.end_time,
361
397
  )
362
398
 
363
399
  logger.info(f"Fetching query log from BQ Project {project.id} for {region}")
@@ -80,7 +80,7 @@ class KeyspaceKey(ContainerKey):
80
80
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
81
81
  @capability(
82
82
  SourceCapability.DELETION_DETECTION,
83
- "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
83
+ "Enabled by default via stateful ingestion",
84
84
  supported=True,
85
85
  )
86
86
  class CassandraSource(StatefulIngestionSourceBase):
@@ -123,16 +123,7 @@ class CassandraSource(StatefulIngestionSourceBase):
123
123
  ).workunit_processor,
124
124
  ]
125
125
 
126
- def get_workunits_internal(
127
- self,
128
- ) -> Iterable[MetadataWorkUnit]:
129
- for metadata in self._get_metadata():
130
- if isinstance(metadata, MetadataWorkUnit):
131
- yield metadata
132
- else:
133
- yield from metadata.as_workunits()
134
-
135
- def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
126
+ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
136
127
  if not self.cassandra_api.authenticate():
137
128
  return
138
129
  keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces()
@@ -305,13 +296,11 @@ class CassandraSource(StatefulIngestionSourceBase):
305
296
  qualified_name=dataset_name,
306
297
  description=view.comment,
307
298
  custom_properties=self._get_dataset_custom_props(view),
308
- extra_aspects=[
309
- ViewPropertiesClass(
310
- materialized=True,
311
- viewLogic=view.where_clause, # Use the WHERE clause as view logic
312
- viewLanguage="CQL", # Use "CQL" as the language
313
- ),
314
- ],
299
+ view_definition=ViewPropertiesClass(
300
+ materialized=True,
301
+ viewLogic=view.where_clause, # Use the WHERE clause as view logic
302
+ viewLanguage="CQL", # Use "CQL" as the language
303
+ ),
315
304
  )
316
305
 
317
306
  # Construct and emit lineage off of 'base_table_name'
@@ -1,3 +1,4 @@
1
+ import ssl
1
2
  from dataclasses import dataclass, field
2
3
  from typing import Any, Dict, List, Optional
3
4
 
@@ -128,6 +129,39 @@ class CassandraAPI:
128
129
 
129
130
  self._cassandra_session = cluster.connect()
130
131
  return True
132
+
133
+ ssl_context = None
134
+ if self.config.ssl_ca_certs:
135
+ # Map SSL version string to ssl module constant
136
+ ssl_version_map = {
137
+ "TLS_CLIENT": ssl.PROTOCOL_TLS_CLIENT,
138
+ "TLSv1": ssl.PROTOCOL_TLSv1,
139
+ "TLSv1_1": ssl.PROTOCOL_TLSv1_1,
140
+ "TLSv1_2": ssl.PROTOCOL_TLSv1_2,
141
+ "TLSv1_3": ssl.PROTOCOL_TLSv1_2, # Python's ssl module uses TLSv1_2 for TLS 1.3
142
+ }
143
+
144
+ ssl_protocol = (
145
+ ssl_version_map.get(
146
+ self.config.ssl_version, ssl.PROTOCOL_TLS_CLIENT
147
+ )
148
+ if self.config.ssl_version
149
+ else ssl.PROTOCOL_TLS_CLIENT
150
+ )
151
+ ssl_context = ssl.SSLContext(ssl_protocol)
152
+ ssl_context.load_verify_locations(self.config.ssl_ca_certs)
153
+ if self.config.ssl_certfile and self.config.ssl_keyfile:
154
+ ssl_context.load_cert_chain(
155
+ certfile=self.config.ssl_certfile,
156
+ keyfile=self.config.ssl_keyfile,
157
+ )
158
+ elif self.config.ssl_certfile or self.config.ssl_keyfile:
159
+ # If one is provided, the other must be too.
160
+ # This is a simplification; real-world scenarios might allow one without the other depending on setup.
161
+ raise ValueError(
162
+ "Both ssl_certfile and ssl_keyfile must be provided if one is specified."
163
+ )
164
+
131
165
  if self.config.username and self.config.password:
132
166
  auth_provider = PlainTextAuthProvider(
133
167
  username=self.config.username, password=self.config.password
@@ -136,12 +170,14 @@ class CassandraAPI:
136
170
  [self.config.contact_point],
137
171
  port=self.config.port,
138
172
  auth_provider=auth_provider,
173
+ ssl_context=ssl_context,
139
174
  load_balancing_policy=None,
140
175
  )
141
176
  else:
142
177
  cluster = Cluster(
143
178
  [self.config.contact_point],
144
179
  port=self.config.port,
180
+ ssl_context=ssl_context,
145
181
  load_balancing_policy=None,
146
182
  )
147
183
 
@@ -79,6 +79,26 @@ class CassandraSourceConfig(
79
79
  description="Configuration for cloud-based Cassandra, such as DataStax Astra DB.",
80
80
  )
81
81
 
82
+ ssl_ca_certs: Optional[str] = Field(
83
+ default=None,
84
+ description="Path to the CA certificate file for SSL connections.",
85
+ )
86
+
87
+ ssl_certfile: Optional[str] = Field(
88
+ default=None,
89
+ description="Path to the SSL certificate file for SSL connections.",
90
+ )
91
+
92
+ ssl_keyfile: Optional[str] = Field(
93
+ default=None,
94
+ description="Path to the SSL key file for SSL connections.",
95
+ )
96
+
97
+ ssl_version: Optional[str] = Field(
98
+ default="TLS_CLIENT",
99
+ description="SSL protocol version to use for connections. Options: TLS_CLIENT, TLSv1, TLSv1_1, TLSv1_2, TLSv1_3. Defaults to TLS_CLIENT.",
100
+ )
101
+
82
102
  keyspace_pattern: AllowDenyPattern = Field(
83
103
  default=AllowDenyPattern.allow_all(),
84
104
  description="Regex patterns to filter keyspaces for ingestion.",
@@ -18,7 +18,7 @@ from datahub.ingestion.source.cassandra.cassandra_api import (
18
18
  )
19
19
  from datahub.ingestion.source.cassandra.cassandra_config import CassandraSourceConfig
20
20
  from datahub.ingestion.source.cassandra.cassandra_utils import CassandraSourceReport
21
- from datahub.ingestion.source_report.ingestion_stage import PROFILING
21
+ from datahub.ingestion.source_report.ingestion_stage import IngestionHighStage
22
22
  from datahub.metadata.schema_classes import (
23
23
  DatasetFieldProfileClass,
24
24
  DatasetProfileClass,
@@ -70,30 +70,32 @@ class CassandraProfiler:
70
70
  ) -> Iterable[MetadataWorkUnit]:
71
71
  for keyspace_name in cassandra_data.keyspaces:
72
72
  tables = cassandra_data.tables.get(keyspace_name, [])
73
- with self.report.new_stage(f"{keyspace_name}: {PROFILING}"):
74
- with ThreadPoolExecutor(
73
+ with (
74
+ self.report.new_high_stage(IngestionHighStage.PROFILING),
75
+ ThreadPoolExecutor(
75
76
  max_workers=self.config.profiling.max_workers
76
- ) as executor:
77
- future_to_dataset = {
78
- executor.submit(
79
- self.generate_profile,
80
- keyspace_name,
81
- table_name,
82
- cassandra_data.columns.get(table_name, []),
83
- ): table_name
84
- for table_name in tables
85
- }
86
- for future in as_completed(future_to_dataset):
87
- table_name = future_to_dataset[future]
88
- try:
89
- yield from future.result()
90
- except Exception as exc:
91
- self.report.profiling_skipped_other[table_name] += 1
92
- self.report.failure(
93
- message="Failed to profile for table",
94
- context=f"{keyspace_name}.{table_name}",
95
- exc=exc,
96
- )
77
+ ) as executor,
78
+ ):
79
+ future_to_dataset = {
80
+ executor.submit(
81
+ self.generate_profile,
82
+ keyspace_name,
83
+ table_name,
84
+ cassandra_data.columns.get(table_name, []),
85
+ ): table_name
86
+ for table_name in tables
87
+ }
88
+ for future in as_completed(future_to_dataset):
89
+ table_name = future_to_dataset[future]
90
+ try:
91
+ yield from future.result()
92
+ except Exception as exc:
93
+ self.report.profiling_skipped_other[table_name] += 1
94
+ self.report.failure(
95
+ message="Failed to profile for table",
96
+ context=f"{keyspace_name}.{table_name}",
97
+ exc=exc,
98
+ )
97
99
 
98
100
  def generate_profile(
99
101
  self,
@@ -6,7 +6,6 @@ from datahub.ingestion.source.cassandra.cassandra_api import CassandraColumn
6
6
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
7
7
  StaleEntityRemovalSourceReport,
8
8
  )
9
- from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
10
9
  from datahub.metadata.com.linkedin.pegasus2avro.schema import (
11
10
  SchemaField,
12
11
  SchemaFieldDataType,
@@ -35,7 +34,7 @@ SYSTEM_KEYSPACE_LIST = set(
35
34
 
36
35
 
37
36
  @dataclass
38
- class CassandraSourceReport(StaleEntityRemovalSourceReport, IngestionStageReport):
37
+ class CassandraSourceReport(StaleEntityRemovalSourceReport):
39
38
  num_tables_failed: int = 0
40
39
  num_views_failed: int = 0
41
40
  tables_scanned: int = 0
@@ -0,0 +1,23 @@
1
+ # This is a pretty limited list, and is not really complete yet. Right now it's only used to allow
2
+ # automatic platform mapping when generating lineage and we have a manual override, so
3
+ # it being incomplete is ok. This should not be used for urn validation.
4
+ KNOWN_VALID_PLATFORM_NAMES = [
5
+ "bigquery",
6
+ "cassandra",
7
+ "databricks",
8
+ "delta-lake",
9
+ "dbt",
10
+ "feast",
11
+ "file",
12
+ "gcs",
13
+ "hdfs",
14
+ "hive",
15
+ "mssql",
16
+ "mysql",
17
+ "oracle",
18
+ "postgres",
19
+ "redshift",
20
+ "s3",
21
+ "sagemaker",
22
+ "snowflake",
23
+ ]
@@ -9,7 +9,9 @@ from datahub.configuration.validate_multiline_string import pydantic_multiline_s
9
9
 
10
10
 
11
11
  class GCPCredential(ConfigModel):
12
- project_id: Optional[str] = Field(description="Project id to set the credentials")
12
+ project_id: Optional[str] = Field(
13
+ None, description="Project id to set the credentials"
14
+ )
13
15
  private_key_id: str = Field(description="Private key id")
14
16
  private_key: str = Field(
15
17
  description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
@@ -51,3 +53,9 @@ class GCPCredential(ConfigModel):
51
53
  cred_json = json.dumps(configs, indent=4, separators=(",", ": "))
52
54
  fp.write(cred_json.encode())
53
55
  return fp.name
56
+
57
+ def to_dict(self, project_id: Optional[str] = None) -> Dict[str, str]:
58
+ configs = self.dict()
59
+ if project_id:
60
+ configs["project_id"] = project_id
61
+ return configs
@@ -1,5 +1,10 @@
1
+ import logging
2
+ from typing import Any, Dict
3
+
1
4
  from datahub.utilities.str_enum import StrEnum
2
5
 
6
+ logger = logging.getLogger(__name__)
7
+
3
8
 
4
9
  class DatasetSubTypes(StrEnum):
5
10
  # Generic SubTypes
@@ -25,6 +30,12 @@ class DatasetSubTypes(StrEnum):
25
30
  NEO4J_NODE = "Neo4j Node"
26
31
  NEO4J_RELATIONSHIP = "Neo4j Relationship"
27
32
  SNOWFLAKE_STREAM = "Snowflake Stream"
33
+ DYNAMIC_TABLE = "Dynamic Table"
34
+ API_ENDPOINT = "API Endpoint"
35
+ SLACK_CHANNEL = "Slack Channel"
36
+ PROJECTIONS = "Projections"
37
+ GOOGLE_SHEETS = "Google Sheets"
38
+ GOOGLE_SHEETS_NAMED_RANGE = "Google Sheets Named Range"
28
39
 
29
40
  # TODO: Create separate entity...
30
41
  NOTEBOOK = "Notebook"
@@ -44,13 +55,19 @@ class DatasetContainerSubTypes(StrEnum):
44
55
  GCS_BUCKET = "GCS bucket"
45
56
  ABS_CONTAINER = "ABS container"
46
57
  KEYSPACE = "Keyspace" # Cassandra
58
+ NAMESPACE = "Namespace" # Iceberg
59
+ DREMIO_SPACE = "Dremio Space"
60
+ DREMIO_SOURCE = "Dremio Source"
47
61
 
48
62
 
49
63
  class BIContainerSubTypes(StrEnum):
50
64
  LOOKER_FOLDER = "Folder"
51
65
  LOOKML_PROJECT = "LookML Project"
52
66
  LOOKML_MODEL = "LookML Model"
67
+ TABLEAU_SITE = "Site"
68
+ TABLEAU_PROJECT = "Project"
53
69
  TABLEAU_WORKBOOK = "Workbook"
70
+ POWERBI_WORKSPACE = "Workspace"
54
71
  POWERBI_DATASET = "Semantic Model"
55
72
  POWERBI_DATASET_TABLE = "Table"
56
73
  QLIK_SPACE = "Qlik Space"
@@ -58,6 +75,8 @@ class BIContainerSubTypes(StrEnum):
58
75
  SIGMA_WORKSPACE = "Sigma Workspace"
59
76
  SIGMA_WORKBOOK = "Sigma Workbook"
60
77
  MODE_COLLECTION = "Collection"
78
+ GRAFANA_FOLDER = "Folder"
79
+ GRAFANA_DASHBOARD = "Dashboard"
61
80
 
62
81
 
63
82
  class FlowContainerSubTypes(StrEnum):
@@ -68,10 +87,13 @@ class FlowContainerSubTypes(StrEnum):
68
87
  class JobContainerSubTypes(StrEnum):
69
88
  NIFI_PROCESS_GROUP = "Process Group"
70
89
  MSSQL_JOBSTEP = "Job Step"
71
- MSSQL_STORED_PROCEDURE = "Stored Procedure"
90
+ STORED_PROCEDURE = "Stored Procedure"
72
91
 
73
92
 
74
93
  class BIAssetSubTypes(StrEnum):
94
+ DASHBOARD = "Dashboard"
95
+ CHART = "Chart"
96
+
75
97
  # Generic SubTypes
76
98
  REPORT = "Report"
77
99
 
@@ -93,7 +115,57 @@ class BIAssetSubTypes(StrEnum):
93
115
  SAC_STORY = "Story"
94
116
  SAC_APPLICATION = "Application"
95
117
 
118
+ # Hex
119
+ HEX_PROJECT = "Project"
120
+ HEX_COMPONENT = "Component"
121
+
96
122
 
97
123
  class MLAssetSubTypes(StrEnum):
98
124
  MLFLOW_TRAINING_RUN = "ML Training Run"
99
125
  MLFLOW_EXPERIMENT = "ML Experiment"
126
+ VERTEX_EXPERIMENT = "Experiment"
127
+ VERTEX_EXPERIMENT_RUN = "Experiment Run"
128
+ VERTEX_EXECUTION = "Execution"
129
+
130
+ VERTEX_MODEL = "ML Model"
131
+ VERTEX_MODEL_GROUP = "ML Model Group"
132
+ VERTEX_TRAINING_JOB = "Training Job"
133
+ VERTEX_ENDPOINT = "Endpoint"
134
+ VERTEX_DATASET = "Dataset"
135
+ VERTEX_PROJECT = "Project"
136
+ VERTEX_PIPELINE = "Pipeline Job"
137
+ VERTEX_PIPELINE_TASK = "Pipeline Task"
138
+ VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
139
+
140
+
141
+ def create_source_capability_modifier_enum():
142
+ all_values: Dict[str, Any] = {}
143
+ source_enums = [
144
+ DatasetSubTypes,
145
+ DatasetContainerSubTypes,
146
+ BIContainerSubTypes,
147
+ FlowContainerSubTypes,
148
+ JobContainerSubTypes,
149
+ BIAssetSubTypes,
150
+ MLAssetSubTypes,
151
+ ]
152
+
153
+ for enum_class in source_enums:
154
+ for member in enum_class: # type: ignore[var-annotated]
155
+ if member.name in all_values:
156
+ logger.debug(
157
+ f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
158
+ )
159
+ continue
160
+ all_values[member.name] = member.value
161
+
162
+ enum_code = "class SourceCapabilityModifier(StrEnum):\n"
163
+ for name, value in all_values.items():
164
+ enum_code += f' {name} = "{value}"\n'
165
+
166
+ exec(enum_code, globals())
167
+ return globals()["SourceCapabilityModifier"]
168
+
169
+
170
+ # This will have all values from the enums above
171
+ SourceCapabilityModifier = create_source_capability_modifier_enum()
@@ -11,23 +11,30 @@ from datahub.emitter.mcp_builder import (
11
11
  )
12
12
  from datahub.ingestion.api.workunit import MetadataWorkUnit
13
13
  from datahub.ingestion.source.aws.s3_util import (
14
- get_bucket_name,
15
14
  get_bucket_relative_path,
16
15
  get_s3_prefix,
17
16
  is_s3_uri,
18
17
  )
19
18
  from datahub.ingestion.source.azure.abs_utils import (
20
19
  get_abs_prefix,
21
- get_container_name,
22
20
  get_container_relative_path,
23
21
  is_abs_uri,
24
22
  )
25
23
  from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
24
+ from datahub.ingestion.source.data_lake_common.object_store import (
25
+ get_object_store_bucket_name,
26
+ get_object_store_for_uri,
27
+ )
28
+ from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
26
29
  from datahub.ingestion.source.gcs.gcs_utils import (
27
- get_gcs_bucket_name,
28
30
  get_gcs_prefix,
29
31
  is_gcs_uri,
30
32
  )
33
+ from datahub.metadata.schema_classes import (
34
+ SchemaFieldClass,
35
+ SchemaFieldDataTypeClass,
36
+ StringTypeClass,
37
+ )
31
38
 
32
39
  # hide annoying debug errors from py4j
33
40
  logging.getLogger("py4j").setLevel(logging.ERROR)
@@ -38,6 +45,37 @@ PLATFORM_GCS = "gcs"
38
45
  PLATFORM_ABS = "abs"
39
46
 
40
47
 
48
+ def add_partition_columns_to_schema(
49
+ path_spec: PathSpec, full_path: str, fields: List[SchemaFieldClass]
50
+ ) -> None:
51
+ # Check if using fieldPath v2 format
52
+ is_fieldpath_v2 = any(
53
+ field.fieldPath.startswith("[version=2.0]") for field in fields
54
+ )
55
+
56
+ # Extract partition information from path
57
+ partition_keys = path_spec.get_partition_from_path(full_path)
58
+ if not partition_keys:
59
+ return
60
+
61
+ # Add partition fields to schema
62
+ for partition_key in partition_keys:
63
+ fields.append(
64
+ SchemaFieldClass(
65
+ fieldPath=(
66
+ f"{partition_key[0]}"
67
+ if not is_fieldpath_v2
68
+ else f"[version=2.0].[type=string].{partition_key[0]}"
69
+ ),
70
+ nativeDataType="string",
71
+ type=SchemaFieldDataTypeClass(StringTypeClass()),
72
+ isPartitioningKey=True,
73
+ nullable=False,
74
+ recursive=False,
75
+ )
76
+ )
77
+
78
+
41
79
  class ContainerWUCreator:
42
80
  processed_containers: List[str]
43
81
 
@@ -87,6 +125,13 @@ class ContainerWUCreator:
87
125
 
88
126
  @staticmethod
89
127
  def get_protocol(path: str) -> str:
128
+ object_store = get_object_store_for_uri(path)
129
+ if object_store:
130
+ prefix = object_store.get_prefix(path)
131
+ if prefix:
132
+ return prefix
133
+
134
+ # Legacy fallback
90
135
  protocol: Optional[str] = None
91
136
  if is_s3_uri(path):
92
137
  protocol = get_s3_prefix(path)
@@ -104,13 +149,12 @@ class ContainerWUCreator:
104
149
 
105
150
  @staticmethod
106
151
  def get_bucket_name(path: str) -> str:
107
- if is_s3_uri(path):
108
- return get_bucket_name(path)
109
- elif is_gcs_uri(path):
110
- return get_gcs_bucket_name(path)
111
- elif is_abs_uri(path):
112
- return get_container_name(path)
113
- raise ValueError(f"Unable to get bucket name from path: {path}")
152
+ """
153
+ Get the bucket/container name from any supported object store URI.
154
+
155
+ Delegates to the abstract get_object_store_bucket_name function.
156
+ """
157
+ return get_object_store_bucket_name(path)
114
158
 
115
159
  def get_sub_types(self) -> str:
116
160
  if self.platform == PLATFORM_S3:
@@ -122,6 +166,11 @@ class ContainerWUCreator:
122
166
  raise ValueError(f"Unable to sub type for platform: {self.platform}")
123
167
 
124
168
  def get_base_full_path(self, path: str) -> str:
169
+ object_store = get_object_store_for_uri(path)
170
+ if object_store:
171
+ return object_store.get_object_key(path)
172
+
173
+ # Legacy fallback
125
174
  if self.platform == "s3" or self.platform == "gcs":
126
175
  return get_bucket_relative_path(path)
127
176
  elif self.platform == "abs":