acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0

--- a/datahub/ingestion/source/iceberg/iceberg.py
+++ b/datahub/ingestion/source/iceberg/iceberg.py
@@ -2,6 +2,7 @@ import json
 import logging
 import threading
 import uuid
+from functools import partial
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 from dateutil import parser as dateutil_parser
@@ -11,11 +12,11 @@ from pyiceberg.exceptions import (
     NoSuchNamespaceError,
     NoSuchPropertyException,
     NoSuchTableError,
-    ServerError,
+    RESTError,
 )
 from pyiceberg.schema import Schema, SchemaVisitorPerPrimitiveType, visit
 from pyiceberg.table import Table
-from pyiceberg.typedef import Identifier
+from pyiceberg.typedef import Identifier, Properties
 from pyiceberg.types import (
     BinaryType,
     BooleanType,
@@ -38,6 +39,7 @@ from pyiceberg.types import (
 )
 
 from datahub.emitter.mce_builder import (
+    make_container_urn,
     make_data_platform_urn,
     make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
@@ -45,6 +47,13 @@ from datahub.emitter.mce_builder import (
     make_user_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import NamespaceKey
+from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
+    auto_patch_last_modified,
+)
+from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
+    EnsureAspectSizeProcessor,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -55,8 +64,20 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source_helpers import (
+    AutoSystemMetadata,
+    auto_fix_duplicate_schema_field_paths,
+    auto_fix_empty_field_paths,
+    auto_lowercase_urns,
+    auto_materialize_referenced_tags_terms,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    DatasetSubTypes,
+)
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergSourceConfig,
     IcebergSourceReport,
@@ -68,21 +89,24 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import Status
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status, SubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.container import ContainerProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     OtherSchema,
     SchemaField,
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
+    ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
     TimeStampClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -94,7 +118,7 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
 
 
 @platform_name("Iceberg")
-@support_status(SupportStatus.TESTING)
+@support_status(SupportStatus.INCUBATING)
 @config_class(IcebergSourceConfig)
 @capability(
     SourceCapability.PLATFORM_INSTANCE,
@@ -110,7 +134,9 @@ logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(
     SourceCapability.OWNERSHIP,
     "Automatically ingests ownership information from table properties based on `user_ownership_property` and `group_ownership_property`",
 )
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 class IcebergSource(StatefulIngestionSourceBase):
     """
     ## Integration Details
@@ -121,11 +147,17 @@ class IcebergSource(StatefulIngestionSourceBase):
     [pyiceberg library](https://py.iceberg.apache.org/).
     """
 
+    platform: str = "iceberg"
+
     def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None:
         super().__init__(config, ctx)
-        self.platform: str = "iceberg"
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
+        self.ctx: PipelineContext = ctx
+        self.stamping_processor = AutoSystemMetadata(
+            self.ctx
+        )  # single instance used only when processing namespaces
+        self.namespaces: List[Tuple[Identifier, str]] = []
 
     @classmethod
     def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource":
@@ -133,20 +165,58 @@ class IcebergSource(StatefulIngestionSourceBase):
         return cls(config, ctx)
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        # This source needs to overwrite standard `get_workunit_processor`, because it is unique in terms of usage
+        # of parallelism. Because of this, 2 processors won't work as expected:
+        # 1. browse_path_processor - it needs aspects for a single entity to be continuous - which is not guaranteed
+        #    in this source
+        # 2. automatic stamping with systemMetadata - in current implementation of the Source class this processor
+        #    would have been applied in a thread (single) shared between the source, processors and transformers.
+        #    Since the metadata scraping happens in separate threads, this could lead to difference between
+        #    time used by systemMetadata and actual time at which metadata was read
+        auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
+        if (
+            self.ctx.pipeline_config
+            and self.ctx.pipeline_config.source
+            and self.ctx.pipeline_config.source.config
+            and (
+                (
+                    hasattr(
+                        self.ctx.pipeline_config.source.config,
+                        "convert_urns_to_lowercase",
+                    )
+                    and self.ctx.pipeline_config.source.config.convert_urns_to_lowercase
+                )
+                or (
+                    hasattr(self.ctx.pipeline_config.source.config, "get")
+                    and self.ctx.pipeline_config.source.config.get(
+                        "convert_urns_to_lowercase"
+                    )
+                )
+            )
+        ):
+            auto_lowercase_dataset_urns = auto_lowercase_urns
+
         return [
-            *super().get_workunit_processors(),
+            auto_lowercase_dataset_urns,
+            auto_materialize_referenced_tags_terms,
+            partial(
+                auto_fix_duplicate_schema_field_paths, platform=self.infer_platform()
+            ),
+            partial(auto_fix_empty_field_paths, platform=self.infer_platform()),
+            partial(auto_workunit_reporter, self.get_report()),
+            auto_patch_last_modified,
+            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
             ).workunit_processor,
         ]
 
-    def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]:
+    def _get_namespaces(self, catalog: Catalog) -> Iterable[Identifier]:
         namespaces = catalog.list_namespaces()
         LOGGER.debug(
             f"Retrieved {len(namespaces)} namespaces, first 10: {namespaces[:10]}"
         )
         self.report.report_no_listed_namespaces(len(namespaces))
-        tables_count = 0
         for namespace in namespaces:
             namespace_repr = ".".join(namespace)
             if not self.config.namespace_pattern.allowed(namespace_repr):
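
Note: the list above replaces the defaults that `Source.get_workunit_processors` would otherwise supply. A `MetadataWorkUnitProcessor` is just a callable that maps one workunit stream to another, applied in order, with `None` entries skipped. A minimal sketch of how such a chain is driven (illustrative only; `apply_processors` is a hypothetical helper, not part of this diff, and `object` stands in for `MetadataWorkUnit`):

    from typing import Callable, Iterable, List, Optional

    # Assumed shape, matching the stream-to-stream processor alias used by
    # datahub.ingestion.api.source.
    WorkUnitProcessor = Callable[[Iterable[object]], Iterable[object]]

    def apply_processors(
        stream: Iterable[object],
        processors: List[Optional[WorkUnitProcessor]],
    ) -> Iterable[object]:
        # Each processor lazily wraps the previous stream, so workunits
        # flow through the whole chain one at a time.
        for processor in processors:
            if processor is not None:
                stream = processor(stream)
        return stream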
@@ -155,6 +225,14 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 self.report.report_dropped(f"{namespace_repr}.*")
                 continue
+            yield namespace
+
+    def _get_datasets(
+        self, catalog: Catalog, namespaces: Iterable[Tuple[Identifier, str]]
+    ) -> Iterable[Tuple[Identifier, str]]:
+        LOGGER.debug("Starting to retrieve tables")
+        tables_count = 0
+        for namespace, namespace_urn in namespaces:
             try:
                 tables = catalog.list_tables(namespace)
                 tables_count += len(tables)
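
Note: namespace enumeration (filtered by `namespace_pattern`) is now a first phase, and table listing becomes a second phase that consumes the (namespace, container urn) pairs collected during phase one, so every table keeps a reference to its parent container. A simplified sketch of that second phase (assumes a pyiceberg `Catalog` with `list_tables`, as in the diff):

    def enumerate_tables(catalog, namespaces):
        # namespaces: already-filtered (Identifier, container_urn) pairs
        # collected while emitting the namespace containers.
        for namespace, namespace_urn in namespaces:
            for table_id in catalog.list_tables(namespace):
                yield table_id, namespace_urn  # table keeps its parent urn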
@@ -164,29 +242,34 @@
                 self.report.report_listed_tables_for_namespace(
                     ".".join(namespace), len(tables)
                 )
-                yield from tables
-            except NoSuchNamespaceError:
-                self.report.report_warning(
-                    "no-such-namespace",
-                    f"Couldn't list tables for namespace {namespace} due to NoSuchNamespaceError exception",
+                yield from [(table, namespace_urn) for table in tables]
+            except NoSuchNamespaceError as e:
+                self.report.warning(
+                    title="No such namespace",
+                    message="Skipping the missing namespace.",
+                    context=str(namespace),
+                    exc=e,
                 )
-                LOGGER.warning(
-                    f"NoSuchNamespaceError exception while trying to get list of tables from namespace {namespace}, skipping it",
+            except RESTError as e:
+                self.report.warning(
+                    title="Iceberg REST Server Error",
+                    message="Iceberg REST Server returned error status when trying to list tables for a namespace, skipping it.",
+                    context=str(namespace),
+                    exc=e,
                 )
             except Exception as e:
                 self.report.report_failure(
-                    "listing-tables-exception",
-                    f"Couldn't list tables for namespace {namespace} due to {e}",
-                )
-                LOGGER.exception(
-                    f"Unexpected exception while trying to get list of tables for namespace {namespace}, skipping it"
+                    title="Error when processing a namespace",
+                    message="Skipping the namespace due to errors while processing it.",
+                    context=str(namespace),
+                    exc=e,
                 )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()
 
         def _try_processing_dataset(
-            dataset_path: Tuple[str, ...], dataset_name: str
+            dataset_path: Tuple[str, ...], dataset_name: str, namespace_urn: str
         ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
@@ -195,6 +278,12 @@
                     )
                     thread_local.local_catalog = self.config.get_catalog()
 
+                if not hasattr(thread_local, "stamping_processor"):
+                    LOGGER.debug(
+                        f"Didn't find stamping_processor in thread_local ({thread_local}), initializing new workunit processor"
+                    )
+                    thread_local.stamping_processor = AutoSystemMetadata(self.ctx)
+
                 with PerfTimer() as timer:
                     table = thread_local.local_catalog.load_table(dataset_path)
                     time_taken = timer.elapsed_seconds()
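
Note: both the catalog client and the systemMetadata stamper are created lazily per worker thread via `threading.local()`, since instances shared across `ThreadedIteratorExecutor` workers are not assumed to be thread-safe. The pattern in isolation (a sketch; `factory` stands in for `self.config.get_catalog`):

    import threading

    _local = threading.local()

    def get_thread_instance(factory):
        # The first call on each thread builds the instance; later calls
        # on the same thread reuse it. Other threads never see it.
        if not hasattr(_local, "instance"):
            _local.instance = factory()
        return _local.instance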
@@ -202,56 +291,68 @@
                     time_taken, dataset_name, table.metadata_location
                 )
                 LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
-                yield from self._create_iceberg_workunit(dataset_name, table)
-            except NoSuchPropertyException as e:
-                self.report.report_warning(
-                    "table-property-missing",
-                    f"Failed to create workunit for {dataset_name}. {e}",
+                dataset_urn: str = make_dataset_urn_with_platform_instance(
+                    self.platform,
+                    dataset_name,
+                    self.config.platform_instance,
+                    self.config.env,
                 )
-                LOGGER.warning(
-                    f"NoSuchPropertyException while processing table {dataset_path}, skipping it.",
+                for aspect in self._create_iceberg_table_aspects(
+                    dataset_name, table, namespace_urn
+                ):
+                    yield thread_local.stamping_processor.stamp_wu(
+                        MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=aspect
+                        ).as_workunit()
+                    )
+            except NoSuchPropertyException as e:
+                self.report.warning(
+                    title="Unable to process table",
+                    message="Table was not processed due to expected property missing (table is probably not an iceberg table).",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchIcebergTableError as e:
-                self.report.report_warning(
-                    "not-an-iceberg-table",
-                    f"Failed to create workunit for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Skipped non-iceberg table",
+                    message="Table was recognized as non-iceberg and skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchTableError as e:
-                self.report.report_warning(
-                    "no-such-table",
-                    f"Failed to create workunit for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Table not found",
+                    message="Table was returned by the catalog in the list of table but catalog can't find its details, table was skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except FileNotFoundError as e:
-                self.report.report_warning(
-                    "file-not-found",
-                    f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"FileNotFoundError while processing table {dataset_path}, skipping it."
-                )
-            except ServerError as e:
-                self.report.report_warning(
-                    "iceberg-rest-server-error",
-                    f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}",
+                self.report.warning(
+                    title="Manifest file not found",
+                    message="Couldn't find manifest file to read for the table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
-                LOGGER.warning(
-                    f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
+            except RESTError as e:
+                self.report.warning(
+                    title="Iceberg REST Server Error",
+                    message="Iceberg REST Server returned error status when trying to process a table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ValueError as e:
                 if "Could not initialize FileIO" not in str(e):
                     raise
                 self.report.warning(
-                    "Could not initialize FileIO",
-                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                    title="Could not initialize FileIO",
+                    message="Could not initialize FileIO for a table (are you using custom FileIO?). Skipping the table.",
+                    context=dataset_name,
+                    exc=e,
                 )
 
-        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+        def _process_dataset(
+            dataset_path: Identifier, namespace_urn: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 LOGGER.debug(f"Processing dataset for path {dataset_path}")
                 dataset_name = ".".join(dataset_path)
@@ -263,106 +364,153 @@
                     )
                     return
 
-                yield from _try_processing_dataset(dataset_path, dataset_name)
+                yield from _try_processing_dataset(
+                    dataset_path, dataset_name, namespace_urn
+                )
             except Exception as e:
                 self.report.report_failure(
-                    "general",
-                    f"Failed to create workunit for dataset {dataset_path}: {e}",
-                )
-                LOGGER.exception(
-                    f"Exception while processing table {dataset_path}, skipping it.",
+                    title="Error when processing a table",
+                    message="Skipping the table due to errors when processing it.",
+                    context=str(dataset_path),
+                    exc=e,
                 )
 
         try:
-            catalog = self.config.get_catalog()
+            self.catalog = self.config.get_catalog()
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to initialize catalog object",
+                message="Couldn't start the ingestion due to failure to initialize catalog object.",
+                exc=e,
+            )
+            return
+
+        try:
+            yield from self._process_namespaces()
         except Exception as e:
-            self.report.report_failure("get-catalog", f"Failed to get catalog: {e}")
+            self.report.report_failure(
+                title="Failed to list namespaces",
+                message="Couldn't start the ingestion due to a failure to process the list of the namespaces",
+                exc=e,
+            )
             return
 
         for wu in ThreadedIteratorExecutor.process(
             worker_func=_process_dataset,
-            args_list=[(dataset_path,) for dataset_path in self._get_datasets(catalog)],
+            args_list=[
+                (dataset_path, namespace_urn)
+                for dataset_path, namespace_urn in self._get_datasets(
+                    self.catalog, self.namespaces
+                )
+            ],
             max_workers=self.config.processing_threads,
         ):
             yield wu
 
-    def _create_iceberg_workunit(
-        self, dataset_name: str, table: Table
+    def _try_processing_namespace(
+        self, namespace: Identifier
     ) -> Iterable[MetadataWorkUnit]:
+        namespace_repr = ".".join(namespace)
+        try:
+            LOGGER.debug(f"Processing namespace {namespace_repr}")
+            namespace_urn = make_container_urn(
+                NamespaceKey(
+                    namespace=namespace_repr,
+                    platform=self.platform,
+                    instance=self.config.platform_instance,
+                    env=self.config.env,
+                )
+            )
+
+            namespace_properties: Properties = self.catalog.load_namespace_properties(
+                namespace
+            )
+            for aspect in self._create_iceberg_namespace_aspects(
+                namespace, namespace_properties
+            ):
+                yield self.stamping_processor.stamp_wu(
+                    MetadataChangeProposalWrapper(
+                        entityUrn=namespace_urn, aspect=aspect
+                    ).as_workunit()
+                )
+            self.namespaces.append((namespace, namespace_urn))
+        except NoSuchNamespaceError as e:
+            self.report.report_warning(
+                title="Failed to retrieve namespace properties",
+                message="Couldn't find the namespace, was it deleted during the ingestion?",
+                context=namespace_repr,
+                exc=e,
+            )
+            return
+        except RESTError as e:
+            self.report.warning(
+                title="Iceberg REST Server Error",
+                message="Iceberg REST Server returned error status when trying to retrieve namespace properties, skipping it.",
+                context=str(namespace),
+                exc=e,
+            )
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to process namespace",
+                message="Unhandled exception happened during processing of the namespace",
+                context=namespace_repr,
+                exc=e,
+            )
+
+    def _process_namespaces(self) -> Iterable[MetadataWorkUnit]:
+        namespace_ids = self._get_namespaces(self.catalog)
+        for namespace in namespace_ids:
+            yield from self._try_processing_namespace(namespace)
+
+        LOGGER.debug("Namespaces ingestion completed")
+
+    def _create_iceberg_table_aspects(
+        self, dataset_name: str, table: Table, namespace_urn: str
+    ) -> Iterable[_Aspect]:
         with PerfTimer() as timer:
             self.report.report_table_scanned(dataset_name)
             LOGGER.debug(f"Processing table {dataset_name}")
-            dataset_urn: str = make_dataset_urn_with_platform_instance(
-                self.platform,
-                dataset_name,
-                self.config.platform_instance,
-                self.config.env,
-            )
-            dataset_snapshot = DatasetSnapshot(
-                urn=dataset_urn,
-                aspects=[Status(removed=False)],
-            )
+            yield Status(removed=False)
+            yield SubTypes(typeNames=[DatasetSubTypes.TABLE])
 
-            # Dataset properties aspect.
-            additional_properties = {}
-            custom_properties = table.metadata.properties.copy()
-            custom_properties["location"] = table.metadata.location
-            custom_properties["format-version"] = str(table.metadata.format_version)
-            custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
-            if table.current_snapshot():
-                custom_properties["snapshot-id"] = str(
-                    table.current_snapshot().snapshot_id
-                )
-                custom_properties["manifest-list"] = (
-                    table.current_snapshot().manifest_list
-                )
-                additional_properties["lastModified"] = TimeStampClass(
-                    int(table.current_snapshot().timestamp_ms)
-                )
-            if "created-at" in custom_properties:
-                try:
-                    dt = dateutil_parser.isoparse(custom_properties["created-at"])
-                    additional_properties["created"] = TimeStampClass(
-                        int(dt.timestamp() * 1000)
-                    )
-                except Exception as ex:
-                    LOGGER.warning(
-                        f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
-                    )
+            yield self._get_dataset_properties_aspect(dataset_name, table)
 
-            dataset_properties = DatasetPropertiesClass(
-                name=table.name()[-1],
-                description=table.metadata.properties.get("comment", None),
-                customProperties=custom_properties,
-                lastModified=additional_properties.get("lastModified"),
-                created=additional_properties.get("created"),
-                qualifiedName=dataset_name,
-            )
-            dataset_snapshot.aspects.append(dataset_properties)
-            # Dataset ownership aspect.
             dataset_ownership = self._get_ownership_aspect(table)
             if dataset_ownership:
                 LOGGER.debug(
                     f"Adding ownership: {dataset_ownership} to the dataset {dataset_name}"
                 )
-                dataset_snapshot.aspects.append(dataset_ownership)
+                yield dataset_ownership
 
-            schema_metadata = self._create_schema_metadata(dataset_name, table)
-            dataset_snapshot.aspects.append(schema_metadata)
+            yield self._create_schema_metadata(dataset_name, table)
+            dpi = self._get_dataplatform_instance_aspect()
+            yield dpi
+            yield self._create_browse_paths_aspect(dpi.instance, str(namespace_urn))
+            yield ContainerClass(container=str(namespace_urn))
 
-            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
             self.report.report_table_processing_time(
                 timer.elapsed_seconds(), dataset_name, table.metadata_location
             )
-            yield MetadataWorkUnit(id=dataset_name, mce=mce)
-
-        dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
-        if dpi_aspect:
-            yield dpi_aspect
 
         if self.config.is_profiling_enabled():
             profiler = IcebergProfiler(self.report, self.config.profiling)
-            yield from profiler.profile_table(dataset_name, dataset_urn, table)
+            yield from profiler.profile_table(dataset_name, table)
+
+    def _create_browse_paths_aspect(
+        self,
+        platform_instance_urn: Optional[str] = None,
+        container_urn: Optional[str] = None,
+    ) -> BrowsePathsV2Class:
+        path = []
+        if platform_instance_urn:
+            path.append(
+                BrowsePathEntryClass(
+                    id=platform_instance_urn, urn=platform_instance_urn
+                )
+            )
+        if container_urn:
+            path.append(BrowsePathEntryClass(id=container_urn, urn=container_urn))
+        return BrowsePathsV2Class(path=path)
 
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
@@ -401,12 +549,48 @@
                 ]
             )
         except Exception as e:
-            self.report.report_warning(
-                "extract-partition",
-                f"Failed to extract partition spec from Iceberg table {table.name()} due to error: {str(e)}",
+            self.report.warning(
+                title="Failed to extract partition information",
+                message="Failed to extract partition information for a table. Table metadata will be ingested without it.",
+                context=str(table.name),
+                exc=e,
             )
             return None
 
+    def _get_dataset_properties_aspect(
+        self, dataset_name: str, table: Table
+    ) -> DatasetPropertiesClass:
+        created: Optional[TimeStampClass] = None
+        custom_properties = table.metadata.properties.copy()
+        custom_properties["location"] = table.metadata.location
+        custom_properties["format-version"] = str(table.metadata.format_version)
+        custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        last_modified: Optional[int] = table.metadata.last_updated_ms
+        if current_snapshot := table.current_snapshot():
+            custom_properties["snapshot-id"] = str(current_snapshot.snapshot_id)
+            custom_properties["manifest-list"] = current_snapshot.manifest_list
+            if not last_modified:
+                last_modified = int(current_snapshot.timestamp_ms)
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                created = TimeStampClass(int(dt.timestamp() * 1000))
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
+        return DatasetPropertiesClass(
+            name=table.name()[-1],
+            description=table.metadata.properties.get("comment", None),
+            customProperties=custom_properties,
+            lastModified=TimeStampClass(last_modified)
+            if last_modified is not None
+            else None,
+            created=created,
+            qualifiedName=dataset_name,
+        )
+
     def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]:
         owners = []
         if self.config.user_ownership_property:
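
Note: `lastModified` now prefers `table.metadata.last_updated_ms` and only falls back to the current snapshot's `timestamp_ms`, so metadata-only changes that don't create a snapshot still update the timestamp. An illustrative result of this helper for a table with one snapshot (all values made up):

    DatasetPropertiesClass(
        name="orders",
        customProperties={
            "location": "s3://warehouse/db1/orders",
            "format-version": "2",
            "partition-spec": '[{"name": "order_date", "transform": "day"}]',
            "snapshot-id": "5819721234567890123",
            "manifest-list": "s3://warehouse/db1/orders/metadata/snap.avro",
        },
        lastModified=TimeStampClass(1700000000000),  # last_updated_ms
        qualifiedName="db1.orders",
    )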
@@ -435,22 +619,15 @@
             )
         return OwnershipClass(owners=owners) if owners else None
 
-    def _get_dataplatform_instance_aspect(
-        self, dataset_urn: str
-    ) -> Optional[MetadataWorkUnit]:
-        # If we are a platform instance based source, emit the instance aspect
-        if self.config.platform_instance:
-            return MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DataPlatformInstanceClass(
-                    platform=make_data_platform_urn(self.platform),
-                    instance=make_dataplatform_instance_urn(
-                        self.platform, self.config.platform_instance
-                    ),
-                ),
-            ).as_workunit()
-
-        return None
+    def _get_dataplatform_instance_aspect(self) -> DataPlatformInstanceClass:
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=make_dataplatform_instance_urn(
+                self.platform, self.config.platform_instance
+            )
+            if self.config.platform_instance
+            else None,
+        )
 
     def _create_schema_metadata(
         self, dataset_name: str, table: Table
@@ -479,6 +656,30 @@
     def get_report(self) -> SourceReport:
         return self.report
 
+    def _create_iceberg_namespace_aspects(
+        self, namespace: Identifier, properties: Properties
+    ) -> Iterable[_Aspect]:
+        namespace_repr = ".".join(namespace)
+        custom_properties: Dict[str, str] = {}
+        for k, v in properties.items():
+            try:
+                custom_properties[str(k)] = str(v)
+            except Exception as e:
+                LOGGER.warning(
+                    f"Exception when trying to parse namespace properties for {namespace_repr}. Exception: {e}"
+                )
+        yield Status(removed=False)
+        yield ContainerProperties(
+            name=namespace_repr,
+            qualifiedName=namespace_repr,
+            env=self.config.env,
+            customProperties=custom_properties,
+        )
+        yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
+        dpi = self._get_dataplatform_instance_aspect()
+        yield dpi
+        yield self._create_browse_paths_aspect(dpi.instance)
+
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
     """Implementation of a visitor to build an Avro schema as a dictionary from an Iceberg schema."""
@@ -635,9 +836,6 @@
             "native_data_type": str(timestamp_type),
         }
 
-    # visit_timestamptz() is required when using pyiceberg >= 0.5.0, which is essentially a duplicate
-    # of visit_timestampz(). The function has been renamed from visit_timestampz().
-    # Once Datahub can upgrade its pyiceberg dependency to >=0.5.0, the visit_timestampz() function can be safely removed.
     def visit_timestamptz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]:
         # Avro supports 2 types of timestamp:
         # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
@@ -654,22 +852,6 @@
             "native_data_type": str(timestamptz_type),
         }
 
-    def visit_timestampz(self, timestamptz_type: TimestamptzType) -> Dict[str, Any]:
-        # Avro supports 2 types of timestamp:
-        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
-        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
-        # utcAdjustment: bool = True
-        return {
-            "type": "long",
-            "logicalType": "timestamp-micros",
-            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
-            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
-            # "logicalType": "timestamp-micros"
-            # if timestamp_type.adjust_to_utc
-            # else "local-timestamp-micros",
-            "native_data_type": str(timestamptz_type),
-        }
-
     def visit_string(self, string_type: StringType) -> Dict[str, Any]:
         return {
             "type": "string",
@@ -688,3 +870,42 @@
             "type": "bytes",
             "native_data_type": str(binary_type),
         }
+
+    def visit_timestamp_ns(self, timestamp_ns_type: Any) -> Dict[str, Any]:
+        # Handle nanosecond precision timestamps
+        # Avro supports 2 types of timestamp:
+        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
+        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
+        return {
+            "type": "long",
+            "logicalType": "timestamp-micros",
+            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
+            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
+            # "logicalType": "timestamp-micros"
+            # if timestamp_ns_type.adjust_to_utc
+            # else "local-timestamp-micros",
+            "native_data_type": str(timestamp_ns_type),
+        }
+
+    def visit_timestamptz_ns(self, timestamptz_ns_type: Any) -> Dict[str, Any]:
+        # Handle nanosecond precision timestamps with timezone
+        # Avro supports 2 types of timestamp:
+        # - Timestamp: independent of a particular timezone or calendar (TZ information is lost)
+        # - Local Timestamp: represents a timestamp in a local timezone, regardless of what specific time zone is considered local
+        return {
+            "type": "long",
+            "logicalType": "timestamp-micros",
+            # Commented out since Avro's Python implementation (1.11.0) does not support local-timestamp-micros, even though it exists in the spec.
+            # See bug report: https://issues.apache.org/jira/browse/AVRO-3476 and PR https://github.com/apache/avro/pull/1634
+            # "logicalType": "timestamp-micros"
+            # if timestamptz_ns_type.adjust_to_utc
+            # else "local-timestamp-micros",
+            "native_data_type": str(timestamptz_ns_type),
+        }
+
+    def visit_unknown(self, unknown_type: Any) -> Dict[str, Any]:
+        # Handle unknown types
+        return {
+            "type": "string",
+            "native_data_type": str(unknown_type),
+        }
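
Note: the new `visit_timestamp_ns`/`visit_timestamptz_ns` handlers cover Iceberg's nanosecond timestamp types, still mapped to Avro `timestamp-micros` for the compatibility reasons given in the comments, while `visit_unknown` falls back to a plain string. For reference, the visitor is driven through pyiceberg's `visit` helper, roughly as follows (a sketch, assuming `table` is a loaded pyiceberg `Table` as elsewhere in this file):

    avro_schema_dict = visit(table.schema(), ToAvroSchemaIcebergVisitor())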