acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414) hide show
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,724 @@
1
+ import logging
2
+ import threading
3
+ import typing
4
+ from abc import ABC, abstractmethod
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+ from typing import (
8
+ Generic,
9
+ Iterable,
10
+ List,
11
+ Optional,
12
+ Protocol,
13
+ Type,
14
+ TypeVar,
15
+ Union,
16
+ cast,
17
+ )
18
+
19
+ import cachetools
20
+ from pydantic import BaseModel
21
+ from typing_extensions import get_original_bases
22
+
23
+ from datahub.api.entities.platformresource.platform_resource import (
24
+ PlatformResource,
25
+ PlatformResourceKey,
26
+ PlatformResourceSearchFields,
27
+ )
28
+ from datahub.ingestion.api.report import SupportsAsObj
29
+ from datahub.ingestion.graph.client import DataHubGraph
30
+ from datahub.metadata.urns import PlatformResourceUrn, Urn
31
+ from datahub.utilities.search_utils import ElasticDocumentQuery
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ # Type variables for generic repository
36
+ TExternalEntityId = TypeVar("TExternalEntityId", bound="ExternalEntityId")
37
+ TExternalEntity = TypeVar("TExternalEntity", bound="ExternalEntity")
38
+
39
+
40
@dataclass(frozen=True)
class UrnCacheKey:
    """Compound key used by the URN search cache.

    Pairs a URN string with an optional platform instance so cache lookups
    work on typed fields rather than on a parsed string. Declaring the
    dataclass with ``frozen=True`` keeps instances immutable and hashable,
    which is what allows them to be used as mapping keys.
    """

    # URN string this key refers to.
    urn: str
    # Platform instance the lookup is scoped to; may be None.
    platform_instance: Optional[str]

    def __str__(self) -> str:
        """Return a human-readable rendering, meant for debugging output."""
        template = "UrnCacheKey(urn={}, platform_instance={})"
        return template.format(self.urn, self.platform_instance)
56
+
57
+
58
class SyncContext(Protocol):
    """Structural interface for platform-specific sync context objects.

    An object satisfies this protocol when it exposes a
    ``platform_instance`` attribute; the attribute's value may be None.
    """

    # Platform instance the sync is scoped to, or None.
    platform_instance: Optional[str]
65
+
66
+
67
+ class PlatformResourceRepository(
68
+ SupportsAsObj, ABC, Generic[TExternalEntityId, TExternalEntity]
69
+ ):
70
+ CACHE_SIZE = 1000
71
+
72
+ # Subclasses should override this with their specific entity class
73
+ entity_class: Type[TExternalEntity]
74
+
75
def __init__(self, graph: DataHubGraph, platform_instance: Optional[str] = None):
    """Initialise the repository's graph handle, caches, counters and lock.

    Args:
        graph: DataHub graph client; stored on the instance as ``graph``.
        platform_instance: Optional platform instance name; stored as-is
            on the instance as ``platform_instance``.
    """
    self.graph = graph
    self.platform_instance = platform_instance

    # The concrete entity class is read from the generic parameters of the
    # runtime class: second type argument of its first original base.
    generic_base = get_original_bases(self.__class__)[0]
    self.entity_class = typing.get_args(generic_base)[1]

    # Both caches share the same capacity, taken from the base class constant.
    max_entries = PlatformResourceRepository.CACHE_SIZE

    # Tier 1 - URN search cache: UrnCacheKey -> external entity id (or None).
    self.urn_search_cache: cachetools.LRUCache[
        UrnCacheKey, Optional[TExternalEntityId]
    ] = cachetools.LRUCache(maxsize=max_entries)
    # Tier 2 - external entity cache: platform_resource_key.id -> entity.
    self.external_entity_cache: cachetools.LRUCache[str, TExternalEntity] = (
        cachetools.LRUCache(maxsize=max_entries)
    )

    # Hit/miss statistics, kept as plain integer counters.
    self.urn_search_cache_hits = self.urn_search_cache_misses = 0
    self.external_entity_cache_hits = self.external_entity_cache_misses = 0

    # Error counters for the individual cache operations.
    self.cache_update_errors = 0
    self.cache_invalidation_errors = 0
    self.entity_creation_errors = 0
    self.cache_key_parsing_errors = 0

    # Re-entrant lock, so code already holding it can acquire it again
    # from the same thread.
    self._cache_lock = threading.RLock()
108
+
109
def search_by_filter(
    self, query: ElasticDocumentQuery, add_to_cache: bool = True
) -> Iterable[PlatformResource]:
    """Lazily yield the platform resources that match ``query``.

    Args:
        query: Search query handed to ``PlatformResource.search_by_filters``
            together with this repository's graph client.
        add_to_cache: Accepted for API compatibility only; it is ignored,
            as raw PlatformResource objects are not cached here.

    Yields:
        Each PlatformResource produced by the search, in the order returned.
    """
    yield from PlatformResource.search_by_filters(self.graph, query)
117
+
118
def create(self, platform_resource: PlatformResource) -> None:
    """Write a platform resource to DataHub, then refresh both local caches.

    The resource is first written with ``platform_resource.to_datahub`` before
    the cache lock is taken. The remaining work happens while holding
    ``self._cache_lock``:

    1. The entity is rebuilt from the resource's serialized value using
       ``self.entity_class``.
    2. Its id is re-created with ``persisted=True`` and a copy of the entity
       carrying that id is stored in ``external_entity_cache`` under the
       platform resource key id.
    3. Every secondary key of the resource is mapped to the new id in
       ``urn_search_cache``.

    The cache refresh is best-effort: an exception raised while rebuilding
    the entity or updating the caches is counted in ``cache_update_errors``
    and logged, not re-raised. An exception from the DataHub write itself
    propagates to the caller.

    Args:
        platform_resource: The resource to persist and to cache.
    """
    # Perform the DataHub write before taking the cache lock.
    platform_resource.to_datahub(self.graph)

    with self._cache_lock:
        resource_info = platform_resource.resource_info
        if not (resource_info and resource_info.value):
            # Nothing serialized on the resource, so nothing to cache.
            return

        try:
            # Recover the entity from the serialized resource value.
            entity_obj = resource_info.value.as_pydantic_object(self.entity_class)
            entity = self.entity_class(**entity_obj.dict())

            entity_id = entity.get_id()
            if not hasattr(entity_id, "dict"):
                # The id cannot be copied field-by-field; leave caches as-is.
                return

            # Build a fresh id flagged as persisted (no in-place mutation).
            id_fields = entity_id.dict()
            id_fields["persisted"] = True
            updated_entity_id = type(entity_id)(**id_fields)

            # Build a fresh entity that carries the updated id.
            entity_fields = entity.dict()  # type: ignore[attr-defined]
            entity_fields["id"] = updated_entity_id
            updated_entity = type(entity)(**entity_fields)

            # Cache the entity under its platform resource key id.
            resource_key = updated_entity_id.to_platform_resource_key()
            self.external_entity_cache[resource_key.id] = updated_entity

            # Point every secondary key at the new id. Assigning the key
            # also replaces a previously cached None for that same key, so
            # no separate pass over the cache is needed to find such
            # entries (a full pass would also read, and thereby touch,
            # every cached item while the lock is held).
            for secondary_key in resource_info.secondary_keys or ():
                urn_cache_key = UrnCacheKey(
                    urn=secondary_key,
                    platform_instance=self.platform_instance,
                )
                self.urn_search_cache[urn_cache_key] = cast(
                    TExternalEntityId, updated_entity_id
                )
        except Exception as cache_error:
            # Best-effort: record and log the failure without re-raising.
            self.cache_update_errors += 1
            logger.error(
                f"Failed to update caches after entity creation for resource {platform_resource.id}: {cache_error}"
            )
233
+
234
def get(self, key: PlatformResourceKey) -> Optional[PlatformResource]:
    """Look up one platform resource by querying DataHub.

    The search is filtered on this repository's resource type and on
    ``key.primary_key`` and is issued with ``add_to_cache=False``; the
    method itself neither reads nor writes any repository cache.

    Args:
        key: Key of the wanted resource. ``key.primary_key`` narrows the
            search and ``key.id`` selects the exact match.

    Returns:
        The first search result whose ``id`` equals ``key.id``, or ``None``
        when no result matches.
    """
    query = ElasticDocumentQuery.create_from(
        (PlatformResourceSearchFields.RESOURCE_TYPE, self.get_resource_type()),
        (PlatformResourceSearchFields.PRIMARY_KEY, key.primary_key),
    )
    # Drain the whole result stream first, then select the exact ID match.
    candidates = list(self.search_by_filter(query, add_to_cache=False))
    return next(
        (candidate for candidate in candidates if candidate.id == key.id),
        None,
    )
259
+
260
def delete(self, key: PlatformResourceKey) -> None:
    """Hard-delete a platform resource and evict its cached entity.

    The DataHub deletion runs first, without holding the cache lock. The
    matching ``external_entity_cache`` entry (keyed by ``key.id``) is then
    removed while ``self._cache_lock`` is held.

    Args:
        key: Key of the platform resource to delete.
    """
    resource_urn = PlatformResourceUrn(key.id).urn()
    # Remote call first, outside the lock, so it never blocks cache users.
    self.graph.delete_entity(urn=resource_urn, hard=True)

    with self._cache_lock:
        cache_id = key.id
        if cache_id in self.external_entity_cache:
            del self.external_entity_cache[cache_id]

        # urn_search_cache is deliberately left untouched: purging it would
        # mean scanning every cached key, so its entries are left to be
        # evicted by the cache itself.
        # NOTE(review): search_entity_by_urn returns a cached value as-is on
        # a cache hit, so an entry that still points at the deleted entity
        # keeps being returned until it is evicted - confirm that callers
        # tolerate this.
276
+
277
def get_resource_type(self) -> str:
    """Return the resource type used to filter platform resource searches.

    The value is simply the ``__name__`` of ``self.entity_class``.

    Returns:
        Resource type string (e.g., 'UnityCatalogTagPlatformResource')
    """
    entity_cls = self.entity_class
    return entity_cls.__name__
286
+
287
def create_default_entity(
    self, entity_id: TExternalEntityId, managed_by_datahub: bool
) -> TExternalEntity:
    """Build the fallback entity used when DataHub has no stored entity.

    Construction is delegated to ``self.entity_class.create_default`` so
    that the creation logic stays with the entity class.

    Args:
        entity_id: The external entity ID
        managed_by_datahub: Whether the entity is managed by DataHub

    Returns:
        The instance returned by ``entity_class.create_default``
    """
    default_entity = self.entity_class.create_default(entity_id, managed_by_datahub)
    # cast() only narrows the static type; the object is returned unchanged.
    return cast(TExternalEntity, default_entity)
308
+
309
def search_entity_by_urn(self, urn: str) -> Optional[TExternalEntityId]:
    """Find the external entity ID linked to a DataHub URN, with caching.

    The lookup is cached under a ``UrnCacheKey`` built from ``urn`` and this
    repository's ``platform_instance``. On a miss, platform resources of this
    repository's resource type whose secondary keys contain ``urn`` are
    searched, and the first one whose entity ID has the same platform
    instance wins. The outcome is cached even when nothing was found.

    Args:
        urn: The URN to search for

    Returns:
        A copy of the matching entity ID with ``persisted`` set to True, or
        None when no stored entity matches
    """
    cache_key = UrnCacheKey(urn=urn, platform_instance=self.platform_instance)

    # Cache lookup and hit/miss counters are updated under the cache lock.
    with self._cache_lock:
        if cache_key in self.urn_search_cache:
            hit = self.urn_search_cache[cache_key]
            self.urn_search_cache_hits += 1
            logger.debug(f"Cache hit for URN search: {cache_key}")
            return hit

        self.urn_search_cache_misses += 1

    logger.debug(
        f"Cache miss for URN {urn} with platform instance {self.platform_instance}"
    )

    query = ElasticDocumentQuery.create_from(
        (PlatformResourceSearchFields.RESOURCE_TYPE, self.get_resource_type()),
        (PlatformResourceSearchFields.SECONDARY_KEYS, urn),
    )
    # add_to_cache=False: the result is cached below by this method itself.
    matches = list(self.search_by_filter(query, add_to_cache=False))

    found: Optional[TExternalEntityId] = None
    for resource in matches:
        info = resource.resource_info
        if not (info and info.value):
            # Nothing to deserialize for this resource.
            continue

        stored = info.value.as_pydantic_object(self.entity_class)
        stored_id = self.entity_class(**stored.dict()).get_id()
        if stored_id.platform_instance != self.platform_instance:
            continue

        # Build a fresh ID rather than mutating the deserialized one; the
        # entity was found in DataHub, so it is flagged as persisted.
        id_fields = stored_id.dict()
        id_fields["persisted"] = True
        found = cast(TExternalEntityId, type(stored_id)(**id_fields))
        break

    # Store the outcome (including None) under the cache lock.
    with self._cache_lock:
        self.urn_search_cache[cache_key] = found
    return found
379
+
380
def get_entity_from_datahub(
    self, entity_id: TExternalEntityId, managed_by_datahub: bool = False
) -> TExternalEntity:
    """Return the external entity for an ID, using the entity cache.

    The cache key is the ``id`` of ``entity_id.to_platform_resource_key()``.
    On a miss, platform resources of this repository's resource type with
    the same primary key are searched:

    * exactly one result: its stored value is deserialized and used
      (NOTE(review): the platform instance is not compared in this case -
      confirm that is intended);
    * several results: the first one whose entity ID has the same platform
      instance as ``entity_id`` is used;
    * otherwise a default entity is built via ``create_default_entity``.

    Whatever entity is returned is also stored in ``external_entity_cache``.

    Args:
        entity_id: The external entity ID to retrieve
        managed_by_datahub: Forwarded to ``create_default_entity`` when a
            default entity has to be created

    Returns:
        The cached, stored or newly created default entity

    Raises:
        Exception: Whatever ``create_default_entity`` raises; the failure is
            counted in ``entity_creation_errors`` and logged before being
            re-raised.
    """
    resource_key = entity_id.to_platform_resource_key()
    cache_key = resource_key.id

    # Cache lookup and hit/miss counters are updated under the cache lock.
    with self._cache_lock:
        cached = self.external_entity_cache.get(cache_key)
        if cached is not None:
            self.external_entity_cache_hits += 1
            logger.debug(f"Cache hit for get_entity_from_datahub: {cache_key}")
            return cached

        self.external_entity_cache_misses += 1
        logger.debug(f"Cache miss for get_entity_from_datahub {entity_id}")

    query = ElasticDocumentQuery.create_from(
        (PlatformResourceSearchFields.RESOURCE_TYPE, self.get_resource_type()),
        (PlatformResourceSearchFields.PRIMARY_KEY, resource_key.primary_key),
    )
    # add_to_cache=False: the result is cached below by this method itself.
    resources = list(self.search_by_filter(query, add_to_cache=False))

    entity = None

    if len(resources) == 1:
        info = resources[0].resource_info
        if info and info.value:
            stored = info.value.as_pydantic_object(self.entity_class)
            entity = self.entity_class(**stored.dict())
    elif len(resources) > 1:
        # Several resources share the primary key: disambiguate by instance.
        wanted_instance = entity_id.platform_instance
        for resource in resources:
            info = resource.resource_info
            if not (info and info.value):
                continue
            stored = info.value.as_pydantic_object(self.entity_class)
            candidate = self.entity_class(**stored.dict())
            if candidate.get_id().platform_instance == wanted_instance:
                entity = candidate
                break

    if entity is None:
        try:
            entity = self.create_default_entity(entity_id, managed_by_datahub)
        except Exception as create_error:
            # Count and log the failure, then let the caller handle it.
            self.entity_creation_errors += 1
            logger.error(
                f"Failed to create default entity for {entity_id}: {create_error}"
            )
            raise

    with self._cache_lock:
        self.external_entity_cache[cache_key] = entity
    return entity
470
+
471
def as_obj(self) -> dict:
    """Return cache statistics and error counters as a plain dictionary.

    This is the ``as_obj`` hook of the SupportsAsObj protocol: the numbers
    are read from the repository's counters and caches each time it is
    called, so no separate statistics collection is needed.

    Returns:
        Dictionary with the structure:
        {
            "search_by_urn_cache": {"hits": int, "misses": int, "current_size": int, "max_size": int},
            "external_entity_cache": {"hits": int, "misses": int, "current_size": int, "max_size": int},
            "errors": {"cache_updates": int, "cache_invalidations": int, "entity_creations": int, "cache_key_parsing": int}
        }
    """
    urn_cache = self.urn_search_cache
    entity_cache = self.external_entity_cache

    urn_cache_stats = {
        "hits": self.urn_search_cache_hits,
        "misses": self.urn_search_cache_misses,
        "current_size": len(urn_cache),
        # maxsize is coerced to int so the report always carries an integer.
        "max_size": int(urn_cache.maxsize),
    }
    entity_cache_stats = {
        "hits": self.external_entity_cache_hits,
        "misses": self.external_entity_cache_misses,
        "current_size": len(entity_cache),
        "max_size": int(entity_cache.maxsize),
    }
    error_counts = {
        "cache_updates": self.cache_update_errors,
        "cache_invalidations": self.cache_invalidation_errors,
        "entity_creations": self.entity_creation_errors,
        "cache_key_parsing": self.cache_key_parsing_errors,
    }

    return {
        "search_by_urn_cache": urn_cache_stats,
        "external_entity_cache": entity_cache_stats,
        "errors": error_counts,
    }
505
+
506
+
507
class ExternalEntityId(BaseModel):
    """
    Unique identifier of an ExternalEntity.
    """

    # Platform instance the identifier belongs to; None when not set.
    platform_instance: Optional[str] = None

    @abstractmethod
    def to_platform_resource_key(self) -> PlatformResourceKey:
        """
        Return the PlatformResourceKey that corresponds to this identifier.
        """
        ...
521
+
522
+
523
class CaseSensitivity(Enum):
    """Letter-case classification of a string or of a list of strings."""

    UPPER = "upper"
    LOWER = "lower"
    MIXED = "mixed"

    @staticmethod
    def detect_case_sensitivity(value: str) -> "CaseSensitivity":
        """
        Classifies a single string using ``str.isupper`` / ``str.islower``.
        Returns UPPER or LOWER when the respective check passes, and MIXED
        otherwise (which includes strings without any cased character).
        """
        if value.isupper():
            return CaseSensitivity.UPPER
        return CaseSensitivity.LOWER if value.islower() else CaseSensitivity.MIXED

    @staticmethod
    def detect_for_many(values: List[str]) -> "CaseSensitivity":
        """
        Detects the case sensitivity for a list of strings.
        Returns UPPER or LOWER only when every string is classified that way;
        returns MIXED for an empty list or for any other combination.
        """
        detected = {CaseSensitivity.detect_case_sensitivity(item) for item in values}
        # A single distinct classification means all strings agree; it may
        # itself be MIXED, which is also the answer for every other case.
        if len(detected) == 1:
            return detected.pop()
        return CaseSensitivity.MIXED
556
+
557
+
558
class LinkedResourceSet(BaseModel):
    """
    A LinkedResourceSet is a set of DataHub URNs that are linked to an ExternalEntity.
    """

    # URN strings of the linked DataHub entities; add() keeps their order.
    urns: List[str]

    def _has_conflict(self, urn: Urn) -> bool:
        """
        Detects whether the urn is unsafe to add into the set.

        The only conflict checked here is the entity type: every URN in the
        set, and the candidate, must share one entity type (e.g. tags and
        terms must not be mixed in the same set).

        Returns True if the urn is not safe to add into the set, else False.
        A URN that is already in the set is not a conflict. A set that
        already holds an unparsable URN or mixed entity types is reported as
        a conflict for any new URN.
        """
        if urn.urn() in self.urns:
            return False

        # Detect the entity_type shared by the URNs already in the set
        detected_entity_type = None
        for existing_urn in self.urns:
            try:
                entity_type = Urn.from_string(existing_urn).entity_type
            except ValueError:
                # Not a valid URN
                logger.warning(f"Invalid URN {existing_urn} in LinkedResourceSet")
                return True

            if detected_entity_type is None:
                detected_entity_type = entity_type
            elif detected_entity_type != entity_type:
                logger.warning(
                    f"Detected entity_type {detected_entity_type} is not equals to {entity_type}"
                )
                return True

        # The candidate is already a parsed Urn, so no parsing (and no
        # ValueError handling) is needed for it.
        if (
            detected_entity_type is not None
            and urn.entity_type != detected_entity_type
        ):
            logger.warning(
                f"Detected entity_type {detected_entity_type} is not equals to parsed_urn's entity_type: {urn.entity_type}"
            )
            return True
        return False

    def add(self, urn: Union[str, Urn]) -> bool:
        """
        Adds a URN to the set.

        A string argument is parsed with Urn.from_string first.

        Returns True if the URN was added, False if it was already in the set.
        Raises a ValueError if the URN is in conflict with the existing set.
        """
        # Deduplicate the URNs if we have somehow duplicate items from concurrent runs.
        # dict.fromkeys keeps the first occurrence of each URN in its original
        # position, so the stored order stays stable between runs (a set
        # round-trip would reorder the list arbitrarily).
        self.urns = list(dict.fromkeys(self.urns))
        if isinstance(urn, str):
            urn = Urn.from_string(urn)
        if self._has_conflict(urn):
            raise ValueError(f"Conflict detected when adding URN {urn} to the set")

        urn_str = urn.urn()
        if urn_str in self.urns:
            return False
        self.urns.append(urn_str)
        return True
628
+
629
+
630
class ExternalEntity(BaseModel):
    """
    An ExternalEntity is a representation of an entity that is external to
    DataHub but could be linked to one or more DataHub entities.
    """

    @abstractmethod
    def is_managed_by_datahub(self) -> bool:
        """
        Tells whether the entity is managed by DataHub.
        """
        ...

    @abstractmethod
    def datahub_linked_resources(self) -> LinkedResourceSet:
        """
        Gives the URNs of the DataHub entities linked to the external entity,
        as a LinkedResourceSet (empty when nothing is linked).
        """
        ...

    @abstractmethod
    def as_platform_resource(self) -> PlatformResource:
        """
        Produces the PlatformResource representation of the ExternalEntity.
        """
        ...

    @abstractmethod
    def get_id(self) -> ExternalEntityId:
        """
        Gives the ExternalEntityId of the ExternalEntity.
        """
        ...

    @classmethod
    @abstractmethod
    def create_default(
        cls, entity_id: "ExternalEntityId", managed_by_datahub: bool
    ) -> "ExternalEntity":
        """
        Build a default entity instance for use when none is found in DataHub.

        Args:
            entity_id: The external entity ID (concrete implementations can expect their specific types)
            managed_by_datahub: Whether the entity is managed by DataHub

        Returns:
            Default entity instance
        """
        ...
681
+
682
+
683
class MissingExternalEntity(ExternalEntity):
    # ExternalEntity implementation that holds nothing but an identifier: it
    # reports no DataHub management, no linked URNs and no platform resource.
    id: ExternalEntityId

    def is_managed_by_datahub(self) -> bool:
        """Always False for this entity type."""
        return False

    def datahub_linked_resources(self) -> LinkedResourceSet:
        """Return a LinkedResourceSet without any URNs."""
        no_urns: List[str] = []
        return LinkedResourceSet(urns=no_urns)

    def as_platform_resource(self) -> Optional[PlatformResource]:  # type: ignore[override]
        """Always None: there is no platform resource for this entity."""
        return None

    def get_id(self) -> ExternalEntityId:
        """Return the identifier stored in ``id``."""
        entity_id = self.id
        return entity_id

    @classmethod
    def create_default(
        cls, entity_id: ExternalEntityId, managed_by_datahub: bool
    ) -> "MissingExternalEntity":
        """Create a missing external entity."""
        # managed_by_datahub is part of the shared signature but is not used.
        missing = cls(id=entity_id)
        return missing
704
+
705
+
706
class ExternalSystem:
    # Interface for a system outside DataHub that can be asked about external
    # entities. The class declares no ABC base, so the @abstractmethod
    # markers document the contract without blocking instantiation.

    @abstractmethod
    def exists(self, external_entity_id: ExternalEntityId) -> bool:
        """
        Tells whether the ExternalEntityId exists in the external system.
        """
        ...

    @abstractmethod
    def get(
        self,
        external_entity_id: ExternalEntityId,
        platform_resource_repository: PlatformResourceRepository,
    ) -> Optional[ExternalEntity]:
        """
        Gives the ExternalEntity for the ExternalEntityId.
        The platform resource repository is used to enrich the ExternalEntity with DataHub URNs.
        """
        ...