acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503):
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,6 @@ import contextlib
2
2
  import datetime
3
3
  import logging
4
4
  from abc import ABCMeta, abstractmethod
5
- from collections import defaultdict
6
5
  from dataclasses import dataclass, field
7
6
  from enum import Enum
8
7
  from functools import partial
@@ -15,7 +14,6 @@ from typing import (
15
14
  List,
16
15
  Optional,
17
16
  Sequence,
18
- Set,
19
17
  Type,
20
18
  TypeVar,
21
19
  Union,
@@ -27,28 +25,39 @@ from typing_extensions import LiteralString, Self
27
25
 
28
26
  from datahub.configuration.common import ConfigModel
29
27
  from datahub.configuration.source_common import PlatformInstanceConfigMixin
30
- from datahub.emitter.mcp_builder import mcps_from_mce
31
28
  from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
32
29
  auto_patch_last_modified,
33
30
  )
34
31
  from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
35
32
  EnsureAspectSizeProcessor,
36
33
  )
34
+ from datahub.ingestion.api.auto_work_units.auto_validate_input_fields import (
35
+ ValidateInputFieldsProcessor,
36
+ )
37
37
  from datahub.ingestion.api.closeable import Closeable
38
38
  from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
39
- from datahub.ingestion.api.report import Report
39
+ from datahub.ingestion.api.report import ExamplesReport, Report
40
40
  from datahub.ingestion.api.source_helpers import (
41
+ AutoSystemMetadata,
41
42
  auto_browse_path_v2,
42
43
  auto_fix_duplicate_schema_field_paths,
43
44
  auto_fix_empty_field_paths,
44
45
  auto_lowercase_urns,
45
46
  auto_materialize_referenced_tags_terms,
46
47
  auto_status_aspect,
48
+ auto_workunit,
47
49
  auto_workunit_reporter,
48
50
  )
51
+ from datahub.ingestion.api.source_protocols import (
52
+ MetadataWorkUnitIterable,
53
+ ProfilingCapable,
54
+ )
49
55
  from datahub.ingestion.api.workunit import MetadataWorkUnit
50
- from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
51
- from datahub.metadata.schema_classes import UpstreamLineageClass
56
+ from datahub.ingestion.source_report.ingestion_stage import (
57
+ IngestionHighStage,
58
+ IngestionStageReport,
59
+ )
60
+ from datahub.telemetry import stats
52
61
  from datahub.utilities.lossy_collections import LossyDict, LossyList
53
62
  from datahub.utilities.type_annotations import get_class_from_annotation
54
63
 
@@ -72,6 +81,7 @@ class SourceCapability(Enum):
72
81
  SCHEMA_METADATA = "Schema Metadata"
73
82
  CONTAINERS = "Asset Containers"
74
83
  CLASSIFICATION = "Classification"
84
+ TEST_CONNECTION = "Test Connection"
75
85
 
76
86
 
77
87
  class StructuredLogLevel(Enum):
@@ -80,11 +90,24 @@ class StructuredLogLevel(Enum):
80
90
  ERROR = logging.ERROR
81
91
 
82
92
 
93
+ class StructuredLogCategory(Enum):
94
+ """
95
+ This is used to categorise the errors mainly based on the biggest impact area
96
+ This is to be used to help in self-serve understand the impact of any log entry
97
+ More enums to be added as logs are updated to be self-serve
98
+ """
99
+
100
+ LINEAGE = "LINEAGE"
101
+ USAGE = "USAGE"
102
+ PROFILING = "PROFILING"
103
+
104
+
83
105
  @dataclass
84
106
  class StructuredLogEntry(Report):
85
107
  title: Optional[str]
86
108
  message: str
87
109
  context: LossyList[str]
110
+ log_category: Optional[StructuredLogCategory] = None
88
111
 
89
112
 
90
113
  @dataclass
@@ -107,9 +130,10 @@ class StructuredLogs(Report):
107
130
  exc: Optional[BaseException] = None,
108
131
  log: bool = False,
109
132
  stacklevel: int = 1,
133
+ log_category: Optional[StructuredLogCategory] = None,
110
134
  ) -> None:
111
135
  """
112
- Report a user-facing warning for the ingestion run.
136
+ Report a user-facing log for the ingestion run.
113
137
 
114
138
  Args:
115
139
  level: The level of the log entry.
@@ -117,6 +141,9 @@ class StructuredLogs(Report):
117
141
  title: The category / heading to present on for this message in the UI.
118
142
  context: Additional context (e.g. where, how) for the log entry.
119
143
  exc: The exception associated with the event. We'll show the stack trace when in debug mode.
144
+ log_category: The type of the log entry. This is used to categorise the log entry.
145
+ log: Whether to log the entry to the console.
146
+ stacklevel: The stack level to use for the log entry.
120
147
  """
121
148
 
122
149
  # One for this method, and one for the containing report_* call.
@@ -159,6 +186,7 @@ class StructuredLogs(Report):
159
186
  title=title,
160
187
  message=message,
161
188
  context=context_list,
189
+ log_category=log_category,
162
190
  )
163
191
  else:
164
192
  if context is not None:
@@ -186,19 +214,11 @@ class StructuredLogs(Report):
186
214
 
187
215
 
188
216
  @dataclass
189
- class SourceReport(Report):
217
+ class SourceReport(ExamplesReport, IngestionStageReport):
190
218
  event_not_produced_warn: bool = True
191
219
  events_produced: int = 0
192
220
  events_produced_per_sec: int = 0
193
-
194
- _urns_seen: Set[str] = field(default_factory=set)
195
- entities: Dict[str, list] = field(default_factory=lambda: defaultdict(LossyList))
196
- aspects: Dict[str, Dict[str, int]] = field(
197
- default_factory=lambda: defaultdict(lambda: defaultdict(int))
198
- )
199
- aspect_urn_samples: Dict[str, Dict[str, LossyList[str]]] = field(
200
- default_factory=lambda: defaultdict(lambda: defaultdict(LossyList))
201
- )
221
+ num_input_fields_filtered: int = 0
202
222
 
203
223
  _structured_logs: StructuredLogs = field(default_factory=StructuredLogs)
204
224
 
@@ -216,33 +236,10 @@ class SourceReport(Report):
216
236
 
217
237
  def report_workunit(self, wu: WorkUnit) -> None:
218
238
  self.events_produced += 1
239
+ if not isinstance(wu, MetadataWorkUnit):
240
+ return
219
241
 
220
- if isinstance(wu, MetadataWorkUnit):
221
- urn = wu.get_urn()
222
-
223
- # Specialized entity reporting.
224
- if not isinstance(wu.metadata, MetadataChangeEvent):
225
- mcps = [wu.metadata]
226
- else:
227
- mcps = list(mcps_from_mce(wu.metadata))
228
-
229
- for mcp in mcps:
230
- entityType = mcp.entityType
231
- aspectName = mcp.aspectName
232
-
233
- if urn not in self._urns_seen:
234
- self._urns_seen.add(urn)
235
- self.entities[entityType].append(urn)
236
-
237
- if aspectName is not None: # usually true
238
- self.aspects[entityType][aspectName] += 1
239
- self.aspect_urn_samples[entityType][aspectName].append(urn)
240
- if isinstance(mcp.aspect, UpstreamLineageClass):
241
- upstream_lineage = cast(UpstreamLineageClass, mcp.aspect)
242
- if upstream_lineage.fineGrainedLineages:
243
- self.aspect_urn_samples[entityType][
244
- "fineGrainedLineages"
245
- ].append(urn)
242
+ super()._store_workunit_data(wu)
246
243
 
247
244
  def report_warning(
248
245
  self,
@@ -250,9 +247,19 @@ class SourceReport(Report):
250
247
  context: Optional[str] = None,
251
248
  title: Optional[LiteralString] = None,
252
249
  exc: Optional[BaseException] = None,
250
+ log_category: Optional[StructuredLogCategory] = None,
253
251
  ) -> None:
252
+ """
253
+ See docs of StructuredLogs.report_log for details of args
254
+ """
254
255
  self._structured_logs.report_log(
255
- StructuredLogLevel.WARN, message, title, context, exc, log=False
256
+ StructuredLogLevel.WARN,
257
+ message,
258
+ title,
259
+ context,
260
+ exc,
261
+ log=False,
262
+ log_category=log_category,
256
263
  )
257
264
 
258
265
  def warning(
@@ -261,9 +268,20 @@ class SourceReport(Report):
261
268
  context: Optional[str] = None,
262
269
  title: Optional[LiteralString] = None,
263
270
  exc: Optional[BaseException] = None,
271
+ log: bool = True,
272
+ log_category: Optional[StructuredLogCategory] = None,
264
273
  ) -> None:
274
+ """
275
+ See docs of StructuredLogs.report_log for details of args
276
+ """
265
277
  self._structured_logs.report_log(
266
- StructuredLogLevel.WARN, message, title, context, exc, log=True
278
+ StructuredLogLevel.WARN,
279
+ message,
280
+ title,
281
+ context,
282
+ exc,
283
+ log=log,
284
+ log_category=log_category,
267
285
  )
268
286
 
269
287
  def report_failure(
@@ -273,9 +291,19 @@ class SourceReport(Report):
273
291
  title: Optional[LiteralString] = None,
274
292
  exc: Optional[BaseException] = None,
275
293
  log: bool = True,
294
+ log_category: Optional[StructuredLogCategory] = None,
276
295
  ) -> None:
296
+ """
297
+ See docs of StructuredLogs.report_log for details of args
298
+ """
277
299
  self._structured_logs.report_log(
278
- StructuredLogLevel.ERROR, message, title, context, exc, log=log
300
+ StructuredLogLevel.ERROR,
301
+ message,
302
+ title,
303
+ context,
304
+ exc,
305
+ log=log,
306
+ log_category=log_category,
279
307
  )
280
308
 
281
309
  def failure(
@@ -285,9 +313,19 @@ class SourceReport(Report):
285
313
  title: Optional[LiteralString] = None,
286
314
  exc: Optional[BaseException] = None,
287
315
  log: bool = True,
316
+ log_category: Optional[StructuredLogCategory] = None,
288
317
  ) -> None:
318
+ """
319
+ See docs of StructuredLogs.report_log for details of args
320
+ """
289
321
  self._structured_logs.report_log(
290
- StructuredLogLevel.ERROR, message, title, context, exc, log=log
322
+ StructuredLogLevel.ERROR,
323
+ message,
324
+ title,
325
+ context,
326
+ exc,
327
+ log=log,
328
+ log_category=log_category,
291
329
  )
292
330
 
293
331
  def info(
@@ -297,9 +335,19 @@ class SourceReport(Report):
297
335
  title: Optional[LiteralString] = None,
298
336
  exc: Optional[BaseException] = None,
299
337
  log: bool = True,
338
+ log_category: Optional[StructuredLogCategory] = None,
300
339
  ) -> None:
340
+ """
341
+ See docs of StructuredLogs.report_log for details of args
342
+ """
301
343
  self._structured_logs.report_log(
302
- StructuredLogLevel.INFO, message, title, context, exc, log=log
344
+ StructuredLogLevel.INFO,
345
+ message,
346
+ title,
347
+ context,
348
+ exc,
349
+ log=log,
350
+ log_category=log_category,
303
351
  )
304
352
 
305
353
  @contextlib.contextmanager
@@ -309,6 +357,7 @@ class SourceReport(Report):
309
357
  title: Optional[LiteralString] = None,
310
358
  context: Optional[str] = None,
311
359
  level: StructuredLogLevel = StructuredLogLevel.ERROR,
360
+ log_category: Optional[StructuredLogCategory] = None,
312
361
  ) -> Iterator[None]:
313
362
  # Convenience method that helps avoid boilerplate try/except blocks.
314
363
  # TODO: I'm not super happy with the naming here - it's not obvious that this
@@ -317,10 +366,16 @@ class SourceReport(Report):
317
366
  yield
318
367
  except Exception as exc:
319
368
  self._structured_logs.report_log(
320
- level, message=message, title=title, context=context, exc=exc
369
+ level,
370
+ message=message,
371
+ title=title,
372
+ context=context,
373
+ exc=exc,
374
+ log_category=log_category,
321
375
  )
322
376
 
323
377
  def __post_init__(self) -> None:
378
+ super().__post_init__()
324
379
  self.start_time = datetime.datetime.now()
325
380
  self.running_time: datetime.timedelta = datetime.timedelta(seconds=0)
326
381
 
@@ -333,6 +388,43 @@ class SourceReport(Report):
333
388
  "infos": Report.to_pure_python_obj(self.infos),
334
389
  }
335
390
 
391
+ @staticmethod
392
+ def _discretize_dict_values(
393
+ nested_dict: Dict[str, Dict[str, int]],
394
+ ) -> Dict[str, Dict[str, int]]:
395
+ """Helper method to discretize values in a nested dictionary structure."""
396
+ result = {}
397
+ for outer_key, inner_dict in nested_dict.items():
398
+ discretized_dict: Dict[str, int] = {}
399
+ for inner_key, count in inner_dict.items():
400
+ discretized_dict[inner_key] = stats.discretize(count)
401
+ result[outer_key] = discretized_dict
402
+ return result
403
+
404
+ def get_aspects_dict(self) -> Dict[str, Dict[str, int]]:
405
+ """Convert the nested defaultdict aspects to a regular dict for serialization."""
406
+ return self._discretize_dict_values(self.aspects)
407
+
408
+ def get_aspects_by_subtypes_dict(self) -> Dict[str, Dict[str, Dict[str, int]]]:
409
+ """Get aspect counts grouped by entity type and subtype."""
410
+ return self._discretize_dict_values_nested(self.aspects_by_subtypes)
411
+
412
+ @staticmethod
413
+ def _discretize_dict_values_nested(
414
+ nested_dict: Dict[str, Dict[str, Dict[str, int]]],
415
+ ) -> Dict[str, Dict[str, Dict[str, int]]]:
416
+ """Helper method to discretize values in a nested dictionary structure with three levels."""
417
+ result = {}
418
+ for outer_key, middle_dict in nested_dict.items():
419
+ discretized_middle_dict: Dict[str, Dict[str, int]] = {}
420
+ for middle_key, inner_dict in middle_dict.items():
421
+ discretized_inner_dict: Dict[str, int] = {}
422
+ for inner_key, count in inner_dict.items():
423
+ discretized_inner_dict[inner_key] = stats.discretize(count)
424
+ discretized_middle_dict[middle_key] = discretized_inner_dict
425
+ result[outer_key] = discretized_middle_dict
426
+ return result
427
+
336
428
  def compute_stats(self) -> None:
337
429
  super().compute_stats()
338
430
 
@@ -416,12 +508,9 @@ class Source(Closeable, metaclass=ABCMeta):
416
508
  Run in order, first in list is applied first. Be careful with order when overriding.
417
509
  """
418
510
  browse_path_processor: Optional[MetadataWorkUnitProcessor] = None
419
- if (
420
- self.ctx.pipeline_config
421
- and self.ctx.pipeline_config.flags.generate_browse_path_v2
422
- ):
511
+ if self.ctx.flags.generate_browse_path_v2:
423
512
  browse_path_processor = self._get_browse_path_processor(
424
- self.ctx.pipeline_config.flags.generate_browse_path_v2_dry_run
513
+ self.ctx.flags.generate_browse_path_v2_dry_run
425
514
  )
426
515
 
427
516
  auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None
@@ -452,12 +541,13 @@ class Source(Closeable, metaclass=ABCMeta):
452
541
  auto_status_aspect,
453
542
  auto_materialize_referenced_tags_terms,
454
543
  partial(
455
- auto_fix_duplicate_schema_field_paths, platform=self._infer_platform()
544
+ auto_fix_duplicate_schema_field_paths, platform=self.infer_platform()
456
545
  ),
457
- partial(auto_fix_empty_field_paths, platform=self._infer_platform()),
546
+ partial(auto_fix_empty_field_paths, platform=self.infer_platform()),
458
547
  browse_path_processor,
459
548
  partial(auto_workunit_reporter, self.get_report()),
460
549
  auto_patch_last_modified,
550
+ ValidateInputFieldsProcessor(self.get_report()).validate_input_fields,
461
551
  EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
462
552
  ]
463
553
 
@@ -472,11 +562,33 @@ class Source(Closeable, metaclass=ABCMeta):
472
562
  return stream
473
563
 
474
564
  def get_workunits(self) -> Iterable[MetadataWorkUnit]:
475
- return self._apply_workunit_processors(
476
- self.get_workunit_processors(), self.get_workunits_internal()
565
+ workunit_processors = self.get_workunit_processors()
566
+ workunit_processors.append(AutoSystemMetadata(self.ctx).stamp)
567
+ # Process main workunits
568
+ yield from self._apply_workunit_processors(
569
+ workunit_processors, auto_workunit(self.get_workunits_internal())
477
570
  )
571
+ # Process profiling workunits
572
+ yield from self._process_profiling_stage(workunit_processors)
573
+
574
+ def _process_profiling_stage(
575
+ self, processors: List[Optional[MetadataWorkUnitProcessor]]
576
+ ) -> Iterable[MetadataWorkUnit]:
577
+ """Process profiling stage if source supports it."""
578
+ if (
579
+ not isinstance(self, ProfilingCapable)
580
+ or not self.is_profiling_enabled_internal()
581
+ ):
582
+ return
583
+ with self.get_report().new_high_stage(IngestionHighStage.PROFILING):
584
+ profiling_stream = self._apply_workunit_processors(
585
+ processors, auto_workunit(self.get_profiling_internal())
586
+ )
587
+ yield from profiling_stream
478
588
 
479
- def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
589
+ def get_workunits_internal(
590
+ self,
591
+ ) -> MetadataWorkUnitIterable:
480
592
  raise NotImplementedError(
481
593
  "get_workunits_internal must be implemented if get_workunits is not overriden."
482
594
  )
@@ -498,9 +610,9 @@ class Source(Closeable, metaclass=ABCMeta):
498
610
  pass
499
611
 
500
612
  def close(self) -> None:
501
- pass
613
+ self.get_report().close()
502
614
 
503
- def _infer_platform(self) -> Optional[str]:
615
+ def infer_platform(self) -> Optional[str]:
504
616
  config = self.get_config()
505
617
  platform = (
506
618
  getattr(config, "platform_name", None)
@@ -515,7 +627,7 @@ class Source(Closeable, metaclass=ABCMeta):
515
627
  def _get_browse_path_processor(self, dry_run: bool) -> MetadataWorkUnitProcessor:
516
628
  config = self.get_config()
517
629
 
518
- platform = self._infer_platform()
630
+ platform = self.infer_platform()
519
631
  env = getattr(config, "env", None)
520
632
  browse_path_drop_dirs = [
521
633
  platform,
@@ -13,9 +13,14 @@ from typing import (
13
13
  )
14
14
 
15
15
  from datahub.configuration.time_window_config import BaseTimeWindowConfig
16
- from datahub.emitter.mce_builder import make_dataplatform_instance_urn, parse_ts_millis
16
+ from datahub.emitter.mce_builder import (
17
+ get_sys_time,
18
+ make_dataplatform_instance_urn,
19
+ parse_ts_millis,
20
+ )
17
21
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
18
22
  from datahub.emitter.mcp_builder import entity_supports_aspect
23
+ from datahub.ingestion.api.common import PipelineContext
19
24
  from datahub.ingestion.api.workunit import MetadataWorkUnit
20
25
  from datahub.metadata.schema_classes import (
21
26
  BrowsePathEntryClass,
@@ -35,6 +40,7 @@ from datahub.metadata.schema_classes import (
35
40
  TimeWindowSizeClass,
36
41
  )
37
42
  from datahub.metadata.urns import DatasetUrn, GlossaryTermUrn, TagUrn, Urn
43
+ from datahub.sdk.entity import Entity
38
44
  from datahub.specific.dataset import DatasetPatchBuilder
39
45
  from datahub.telemetry import telemetry
40
46
  from datahub.utilities.urns.error import InvalidUrnError
@@ -48,7 +54,14 @@ logger = logging.getLogger(__name__)
48
54
 
49
55
 
50
56
  def auto_workunit(
51
- stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]],
57
+ stream: Iterable[
58
+ Union[
59
+ MetadataChangeEventClass,
60
+ MetadataChangeProposalWrapper,
61
+ MetadataWorkUnit,
62
+ Entity,
63
+ ]
64
+ ],
52
65
  ) -> Iterable[MetadataWorkUnit]:
53
66
  """Convert a stream of MCEs and MCPs to a stream of :class:`MetadataWorkUnit`s."""
54
67
 
@@ -58,8 +71,12 @@ def auto_workunit(
58
71
  id=MetadataWorkUnit.generate_workunit_id(item),
59
72
  mce=item,
60
73
  )
61
- else:
74
+ elif isinstance(item, MetadataChangeProposalWrapper):
62
75
  yield item.as_workunit()
76
+ elif isinstance(item, Entity):
77
+ yield from item.as_workunits()
78
+ else:
79
+ yield item
63
80
 
64
81
 
65
82
  def create_dataset_props_patch_builder(
@@ -75,6 +92,7 @@ def create_dataset_props_patch_builder(
75
92
  patch_builder.set_last_modified(dataset_properties.lastModified)
76
93
  patch_builder.set_qualified_name(dataset_properties.qualifiedName)
77
94
  patch_builder.add_custom_properties(dataset_properties.customProperties)
95
+ patch_builder.set_external_url(dataset_properties.externalUrl)
78
96
 
79
97
  return patch_builder
80
98
 
@@ -532,3 +550,23 @@ def _prepend_platform_instance(
532
550
  return [BrowsePathEntryClass(id=urn, urn=urn)] + entries
533
551
 
534
552
  return entries
553
+
554
+
555
+ class AutoSystemMetadata:
556
+ def __init__(self, ctx: PipelineContext):
557
+ self.ctx = ctx
558
+
559
+ def stamp(self, stream: Iterable[MetadataWorkUnit]) -> Iterable[MetadataWorkUnit]:
560
+ for wu in stream:
561
+ yield self.stamp_wu(wu)
562
+
563
+ def stamp_wu(self, wu: MetadataWorkUnit) -> MetadataWorkUnit:
564
+ if self.ctx.flags.set_system_metadata:
565
+ if not wu.metadata.systemMetadata:
566
+ wu.metadata.systemMetadata = SystemMetadataClass()
567
+ wu.metadata.systemMetadata.runId = self.ctx.run_id
568
+ if not wu.metadata.systemMetadata.lastObserved:
569
+ wu.metadata.systemMetadata.lastObserved = get_sys_time()
570
+ if self.ctx.flags.set_system_metadata_pipeline_name:
571
+ wu.metadata.systemMetadata.pipelineName = self.ctx.pipeline_name
572
+ return wu
@@ -0,0 +1,23 @@
1
+ from typing import Iterable, Protocol, Union, runtime_checkable
2
+
3
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
4
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
5
+ from datahub.sdk.entity import Entity
6
+
7
+ # Type alias for metadata work units - Python 3.9 compatible
8
+ MetadataWorkUnitIterable = Iterable[
9
+ Union[MetadataWorkUnit, MetadataChangeProposalWrapper, Entity]
10
+ ]
11
+
12
+
13
+ @runtime_checkable
14
+ class ProfilingCapable(Protocol):
15
+ """Protocol for sources that support profiling functionality."""
16
+
17
+ def is_profiling_enabled_internal(self) -> bool:
18
+ """Check if profiling is enabled for this source."""
19
+ ...
20
+
21
+ def get_profiling_internal(self) -> MetadataWorkUnitIterable:
22
+ """Generate profiling work units."""
23
+ ...
File without changes