acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503) hide show
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,402 @@
1
+ {
2
+ "entities": {
3
+ "dataJob": {
4
+ "dataJobInputOutput": {
5
+ "aspect": "dataJobInputOutput",
6
+ "fields": [
7
+ {
8
+ "name": "inputDatasets",
9
+ "path": "inputDatasets",
10
+ "isLineage": true,
11
+ "relationship": {
12
+ "name": "Consumes",
13
+ "entityTypes": [
14
+ "dataset"
15
+ ],
16
+ "isLineage": true
17
+ }
18
+ },
19
+ {
20
+ "name": "inputDatasetEdges",
21
+ "path": "inputDatasetEdges",
22
+ "isLineage": true,
23
+ "relationship": {
24
+ "name": "Consumes",
25
+ "entityTypes": [
26
+ "dataset"
27
+ ],
28
+ "isLineage": true
29
+ }
30
+ },
31
+ {
32
+ "name": "outputDatasets",
33
+ "path": "outputDatasets",
34
+ "isLineage": true,
35
+ "relationship": {
36
+ "name": "Produces",
37
+ "entityTypes": [
38
+ "dataset"
39
+ ],
40
+ "isLineage": true
41
+ }
42
+ },
43
+ {
44
+ "name": "outputDatasetEdges",
45
+ "path": "outputDatasetEdges",
46
+ "isLineage": true,
47
+ "relationship": {
48
+ "name": "Produces",
49
+ "entityTypes": [
50
+ "dataset"
51
+ ],
52
+ "isLineage": true
53
+ }
54
+ },
55
+ {
56
+ "name": "inputDatajobs",
57
+ "path": "inputDatajobs",
58
+ "isLineage": true,
59
+ "relationship": {
60
+ "name": "DownstreamOf",
61
+ "entityTypes": [
62
+ "dataJob"
63
+ ],
64
+ "isLineage": true
65
+ }
66
+ },
67
+ {
68
+ "name": "inputDatajobEdges",
69
+ "path": "inputDatajobEdges",
70
+ "isLineage": true,
71
+ "relationship": {
72
+ "name": "DownstreamOf",
73
+ "entityTypes": [
74
+ "dataJob"
75
+ ],
76
+ "isLineage": true
77
+ }
78
+ }
79
+ ]
80
+ }
81
+ },
82
+ "dataProcessInstance": {
83
+ "dataProcessInstanceOutput": {
84
+ "aspect": "dataProcessInstanceOutput",
85
+ "fields": [
86
+ {
87
+ "name": "outputEdges",
88
+ "path": "outputEdges",
89
+ "isLineage": true,
90
+ "relationship": {
91
+ "name": "DataProcessInstanceProduces",
92
+ "entityTypes": [
93
+ "dataset",
94
+ "mlModel",
95
+ "dataProcessInstance"
96
+ ],
97
+ "isLineage": true
98
+ }
99
+ }
100
+ ]
101
+ },
102
+ "dataProcessInstanceInput": {
103
+ "aspect": "dataProcessInstanceInput",
104
+ "fields": [
105
+ {
106
+ "name": "inputEdges",
107
+ "path": "inputEdges",
108
+ "isLineage": true,
109
+ "relationship": {
110
+ "name": "DataProcessInstanceConsumes",
111
+ "entityTypes": [
112
+ "dataset",
113
+ "mlModel",
114
+ "dataProcessInstance"
115
+ ],
116
+ "isLineage": true
117
+ }
118
+ }
119
+ ]
120
+ }
121
+ },
122
+ "dataProcess": {
123
+ "dataProcessInfo": {
124
+ "aspect": "dataProcessInfo",
125
+ "fields": [
126
+ {
127
+ "name": "inputs",
128
+ "path": "inputs",
129
+ "isLineage": true,
130
+ "relationship": {
131
+ "name": "Consumes",
132
+ "entityTypes": [
133
+ "dataset"
134
+ ],
135
+ "isLineage": true
136
+ }
137
+ },
138
+ {
139
+ "name": "outputs",
140
+ "path": "outputs",
141
+ "isLineage": true,
142
+ "relationship": {
143
+ "name": "Consumes",
144
+ "entityTypes": [
145
+ "dataset"
146
+ ],
147
+ "isLineage": true
148
+ }
149
+ }
150
+ ]
151
+ }
152
+ },
153
+ "dataset": {
154
+ "upstreamLineage": {
155
+ "aspect": "upstreamLineage",
156
+ "fields": [
157
+ {
158
+ "name": "dataset",
159
+ "path": "upstreams.dataset",
160
+ "isLineage": true,
161
+ "relationship": {
162
+ "name": "DownstreamOf",
163
+ "entityTypes": [
164
+ "dataset"
165
+ ],
166
+ "isLineage": true
167
+ }
168
+ }
169
+ ]
170
+ }
171
+ },
172
+ "chart": {
173
+ "chartInfo": {
174
+ "aspect": "chartInfo",
175
+ "fields": [
176
+ {
177
+ "name": "inputs",
178
+ "path": "inputs",
179
+ "isLineage": true,
180
+ "relationship": {
181
+ "name": "Consumes",
182
+ "entityTypes": [
183
+ "dataset"
184
+ ],
185
+ "isLineage": true
186
+ }
187
+ },
188
+ {
189
+ "name": "inputEdges",
190
+ "path": "inputEdges",
191
+ "isLineage": true,
192
+ "relationship": {
193
+ "name": "Consumes",
194
+ "entityTypes": [
195
+ "dataset",
196
+ "chart"
197
+ ],
198
+ "isLineage": true
199
+ }
200
+ }
201
+ ]
202
+ }
203
+ },
204
+ "dashboard": {
205
+ "dashboardInfo": {
206
+ "aspect": "dashboardInfo",
207
+ "fields": [
208
+ {
209
+ "name": "charts",
210
+ "path": "charts",
211
+ "isLineage": true,
212
+ "relationship": {
213
+ "name": "Contains",
214
+ "entityTypes": [
215
+ "chart"
216
+ ],
217
+ "isLineage": true
218
+ }
219
+ },
220
+ {
221
+ "name": "chartEdges",
222
+ "path": "chartEdges",
223
+ "isLineage": true,
224
+ "relationship": {
225
+ "name": "Contains",
226
+ "entityTypes": [
227
+ "chart"
228
+ ],
229
+ "isLineage": true
230
+ }
231
+ },
232
+ {
233
+ "name": "datasets",
234
+ "path": "datasets",
235
+ "isLineage": true,
236
+ "relationship": {
237
+ "name": "Consumes",
238
+ "entityTypes": [
239
+ "dataset"
240
+ ],
241
+ "isLineage": true
242
+ }
243
+ },
244
+ {
245
+ "name": "datasetEdges",
246
+ "path": "datasetEdges",
247
+ "isLineage": true,
248
+ "relationship": {
249
+ "name": "Consumes",
250
+ "entityTypes": [
251
+ "dataset"
252
+ ],
253
+ "isLineage": true
254
+ }
255
+ },
256
+ {
257
+ "name": "dashboards",
258
+ "path": "dashboards",
259
+ "isLineage": true,
260
+ "relationship": {
261
+ "name": "DashboardContainsDashboard",
262
+ "entityTypes": [
263
+ "dashboard"
264
+ ],
265
+ "isLineage": true
266
+ }
267
+ }
268
+ ]
269
+ }
270
+ },
271
+ "mlModelGroup": {
272
+ "mlModelGroupProperties": {
273
+ "aspect": "mlModelGroupProperties",
274
+ "fields": [
275
+ {
276
+ "name": "trainingJobs",
277
+ "path": "trainingJobs",
278
+ "isLineage": true,
279
+ "relationship": {
280
+ "name": "TrainedBy",
281
+ "entityTypes": [
282
+ "dataJob",
283
+ "dataProcessInstance"
284
+ ],
285
+ "isLineage": true
286
+ }
287
+ },
288
+ {
289
+ "name": "downstreamJobs",
290
+ "path": "downstreamJobs",
291
+ "isLineage": true,
292
+ "relationship": {
293
+ "name": "UsedBy",
294
+ "entityTypes": [
295
+ "dataJob",
296
+ "dataProcessInstance"
297
+ ],
298
+ "isLineage": true
299
+ }
300
+ }
301
+ ]
302
+ }
303
+ },
304
+ "mlFeature": {
305
+ "mlFeatureProperties": {
306
+ "aspect": "mlFeatureProperties",
307
+ "fields": [
308
+ {
309
+ "name": "sources",
310
+ "path": "sources",
311
+ "isLineage": true,
312
+ "relationship": {
313
+ "name": "DerivedFrom",
314
+ "entityTypes": [
315
+ "dataset"
316
+ ],
317
+ "isLineage": true
318
+ }
319
+ }
320
+ ]
321
+ }
322
+ },
323
+ "mlPrimaryKey": {
324
+ "mlPrimaryKeyProperties": {
325
+ "aspect": "mlPrimaryKeyProperties",
326
+ "fields": [
327
+ {
328
+ "name": "sources",
329
+ "path": "sources",
330
+ "isLineage": true,
331
+ "relationship": {
332
+ "name": "DerivedFrom",
333
+ "entityTypes": [
334
+ "dataset"
335
+ ],
336
+ "isLineage": true
337
+ }
338
+ }
339
+ ]
340
+ }
341
+ },
342
+ "mlModel": {
343
+ "mlModelProperties": {
344
+ "aspect": "mlModelProperties",
345
+ "fields": [
346
+ {
347
+ "name": "trainingJobs",
348
+ "path": "trainingJobs",
349
+ "isLineage": true,
350
+ "relationship": {
351
+ "name": "TrainedBy",
352
+ "entityTypes": [
353
+ "dataJob",
354
+ "dataProcessInstance"
355
+ ],
356
+ "isLineage": true
357
+ }
358
+ },
359
+ {
360
+ "name": "downstreamJobs",
361
+ "path": "downstreamJobs",
362
+ "isLineage": true,
363
+ "relationship": {
364
+ "name": "UsedBy",
365
+ "entityTypes": [
366
+ "dataJob",
367
+ "dataProcessInstance"
368
+ ],
369
+ "isLineage": true
370
+ }
371
+ },
372
+ {
373
+ "name": "mlFeatures",
374
+ "path": "mlFeatures",
375
+ "isLineage": true,
376
+ "relationship": {
377
+ "name": "Consumes",
378
+ "entityTypes": [
379
+ "mlFeature"
380
+ ],
381
+ "isLineage": true
382
+ }
383
+ },
384
+ {
385
+ "name": "groups",
386
+ "path": "groups",
387
+ "isLineage": true,
388
+ "relationship": {
389
+ "name": "MemberOf",
390
+ "entityTypes": [
391
+ "mlModelGroup"
392
+ ],
393
+ "isLineage": true
394
+ }
395
+ }
396
+ ]
397
+ }
398
+ }
399
+ },
400
+ "generated_by": "metadata-ingestion/scripts/modeldocgen.py",
401
+ "generated_at": "2025-08-05T19:29:49.306404+00:00"
402
+ }
@@ -0,0 +1,177 @@
1
+ import json
2
+ import logging
3
+ from dataclasses import dataclass
4
+ from functools import lru_cache
5
+ from pathlib import Path
6
+ from typing import Dict, List, Optional
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ # Global cache for lineage data to avoid repeated file reads
11
+ _lineage_data: Optional["LineageData"] = None
12
+
13
+
14
@dataclass
class Field:
    """A single field entry of an aspect, as read from lineage.json."""

    # Field name as given by the "name" key of the JSON entry.
    name: str
    # Value of the "path" key of the JSON entry.
    path: str
    # Value of the "isLineage" key of the JSON entry.
    isLineage: bool
    # Raw "relationship" mapping from the JSON entry, or None when the key is absent.
    relationship: Optional[Dict]
20
+
21
+
22
@dataclass
class Aspect:
    """An aspect of an entity together with the fields listed for it in lineage.json."""

    # Aspect name (the key under which the aspect appears for its entity).
    name: str
    # Field entries parsed from the aspect's "fields" list.
    fields: List[Field]
26
+
27
+
28
@dataclass
class Entity:
    """An entity type and the aspects recorded for it in lineage.json."""

    # Entity type name (the key under "entities" in the JSON).
    name: str
    # aspect name -> Aspect
    aspects: Dict[str, Aspect]
32
+
33
+
34
@dataclass
class LineageData:
    """Parsed contents of lineage.json."""

    # entity name -> Entity
    entities: Dict[str, Entity]
    # Value of the top-level "generated_by" key ("" when missing).
    generated_by: str
    # Value of the top-level "generated_at" key ("" when missing).
    generated_at: str
40
+
41
+
42
def get_lineage_data() -> LineageData:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return the parsed lineage data, building it on first use and reusing the
    module-level cached instance on every later call.

    Returns:
        LineageData built from the dict returned by _load_lineage_data().
    """
    global _lineage_data

    if _lineage_data is None:
        raw_data = _load_lineage_data()

        # Build the typed structure in a separate dict rather than overwriting
        # the raw JSON dict entry by entry.
        parsed_entities: Dict[str, Entity] = {}
        for entity_name, entity_data in raw_data.get("entities", {}).items():
            parsed_aspects: Dict[str, Aspect] = {}
            for aspect_name, aspect_data in entity_data.items():
                parsed_fields: List[Field] = []
                for raw_field in aspect_data.get("fields", []):
                    parsed_fields.append(
                        Field(
                            name=raw_field["name"],
                            path=raw_field["path"],
                            isLineage=raw_field["isLineage"],
                            relationship=raw_field.get("relationship", None),
                        )
                    )
                parsed_aspects[aspect_name] = Aspect(
                    name=aspect_name, fields=parsed_fields
                )
            parsed_entities[entity_name] = Entity(
                name=entity_name, aspects=parsed_aspects
            )

        _lineage_data = LineageData(
            entities=parsed_entities,
            generated_by=raw_data.get("generated_by", ""),
            generated_at=raw_data.get("generated_at", ""),
        )

    return _lineage_data
79
+
80
+
81
def get_all_aspect_names() -> List[str]:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return the names of all aspects present in the lineage data, across every
    entity, without duplicates and in first-seen order.

    Returns:
        List of aspect names; empty when no entities are loaded.
    """
    # A dict is used as an insertion-ordered set so that an aspect name shared
    # by several entities is reported once, in a deterministic position.
    seen: Dict[str, None] = {}
    for entity in get_lineage_data().entities.values():
        for aspect_name in entity.aspects:
            seen.setdefault(aspect_name, None)
    return list(seen)
90
+
91
+
92
def _load_lineage_data() -> Dict:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Load lineage data from the autogenerated lineage.json file that sits next
    to this module.

    Returns:
        Dict containing the lineage information. An empty dict is returned,
        after logging, when the file does not exist or cannot be parsed as
        JSON; json.JSONDecodeError is handled here and not propagated.
    """
    # Get the path to lineage.json relative to this file
    lineage_file = Path(__file__).parent / "lineage.json"

    if not lineage_file.exists():
        logger.warning(
            f"Lineage file not found: {lineage_file}. "
            "This may indicate a packaging issue. Lineage detection will be disabled."
        )
        return {}

    try:
        # Read explicitly as UTF-8 so parsing does not depend on the
        # platform's default text encoding.
        with open(lineage_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        logger.error(
            f"Failed to parse lineage.json: {e}. Lineage detection will be disabled."
        )
        return {}
123
+
124
+
125
def _get_fields(entity_type: str, aspect_name: str) -> List[Dict]:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return the fields of one aspect of one entity type as plain dicts.

    Args:
        entity_type: Entity name to look up in the lineage data.
        aspect_name: Aspect name to look up on that entity.

    Returns:
        One dict per field with the keys "name", "path", "isLineage" and
        "relationship"; an empty list when the entity or aspect is unknown.
    """
    entity = get_lineage_data().entities.get(entity_type)
    aspect = entity.aspects.get(aspect_name) if entity else None
    if not aspect:
        return []

    result: List[Dict] = []
    for entry in aspect.fields:
        result.append(
            {
                "name": entry.name,
                "path": entry.path,
                "isLineage": entry.isLineage,
                "relationship": entry.relationship,
            }
        )
    return result
147
+
148
+
149
def _get_lineage_fields(entity_type: str, aspect_name: str) -> List[Dict]:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Return only those field dicts of the given entity/aspect whose "isLineage"
    value is truthy.
    """
    lineage_fields: List[Dict] = []
    for candidate in _get_fields(entity_type, aspect_name):
        if candidate.get("isLineage", False):
            lineage_fields.append(candidate)
    return lineage_fields
158
+
159
+
160
@lru_cache(maxsize=128)
def is_lineage_aspect(entity_type: str, aspect_name: str) -> bool:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Tell whether the given aspect of the given entity type has at least one
    lineage field. Results are memoized per (entity_type, aspect_name) pair.
    """
    return bool(_get_lineage_fields(entity_type, aspect_name))
166
+
167
+
168
def clear_cache() -> None:
    """
    This is experimental internal API subject to breaking changes without prior notice.

    Clear the internal caches of lineage data.

    This drops both the module-level parsed lineage data and the memoized
    results of is_lineage_aspect(), so the next lookup re-reads lineage.json.

    This is useful for testing or when the lineage.json file has been updated.
    """
    global _lineage_data
    _lineage_data = None
    # is_lineage_aspect is wrapped in lru_cache; without this its previously
    # computed answers would outlive the reset above.
    is_lineage_aspect.cache_clear()
@@ -125,7 +125,7 @@ class AvroToMceSchemaConverter:
125
125
  self._prefix_name_stack: PrefixNameStack = [self.version_string]
126
126
  # Tracks the fields on the current path.
127
127
  self._fields_stack: FieldStack = []
128
- # Tracks the record types seen so far. Used to prevent infinite recursion with recursive types.
128
+ # Stack of record types currently being processed. Used to prevent infinite recursion with recursive types.
129
129
  self._record_types_seen: List[str] = []
130
130
  # If part of the key-schema or value-schema.
131
131
  self._is_key_schema = is_key_schema
@@ -290,6 +290,12 @@ class AvroToMceSchemaConverter:
290
290
  This way we can use the type/description of the non-null type if needed.
291
291
  """
292
292
 
293
+ # props to skip when building jsonProps
294
+ json_props_to_skip = [
295
+ "_nullable",
296
+ "native_data_type",
297
+ ]
298
+
293
299
  def __init__(
294
300
  self,
295
301
  schema: SchemaOrField,
@@ -362,6 +368,7 @@ class AvroToMceSchemaConverter:
362
368
  merged_props: Dict[str, Any] = {}
363
369
  merged_props.update(self._schema.other_props)
364
370
  merged_props.update(schema.other_props)
371
+ merged_props.update(actual_schema.other_props)
365
372
 
366
373
  # Parse meta_mapping
367
374
  meta_aspects: Dict[str, Any] = {}
@@ -406,6 +413,16 @@ class AvroToMceSchemaConverter:
406
413
  or self._actual_schema.props.get("logicalType"),
407
414
  )
408
415
 
416
+ json_props: Optional[Dict[str, Any]] = (
417
+ {
418
+ k: v
419
+ for k, v in merged_props.items()
420
+ if k not in self.json_props_to_skip
421
+ }
422
+ if merged_props
423
+ else None
424
+ )
425
+
409
426
  field = SchemaField(
410
427
  fieldPath=field_path,
411
428
  # Populate it with the simple native type for now.
@@ -420,7 +437,7 @@ class AvroToMceSchemaConverter:
420
437
  isPartOfKey=self._converter._is_key_schema,
421
438
  globalTags=tags_aspect,
422
439
  glossaryTerms=meta_terms_aspect,
423
- jsonProps=json.dumps(merged_props) if merged_props else None,
440
+ jsonProps=json.dumps(json_props) if json_props else None,
424
441
  )
425
442
  yield field
426
443
 
@@ -505,10 +522,12 @@ class AvroToMceSchemaConverter:
505
522
  # Handle recursive record definitions
506
523
  recurse: bool = True
507
524
  if isinstance(schema, avro.schema.RecordSchema):
508
- if schema.fullname not in self._record_types_seen:
509
- self._record_types_seen.append(schema.fullname)
510
- else:
525
+ # Only prevent recursion if we're currently processing this record type (true recursion)
526
+ # Allow reuse of the same record type in different contexts
527
+ if schema.fullname in self._record_types_seen:
511
528
  recurse = False
529
+ else:
530
+ self._record_types_seen.append(schema.fullname)
512
531
 
513
532
  # Adjust actual schema if needed
514
533
  actual_schema = self._get_underlying_type_if_option_as_union(schema, schema)
@@ -542,6 +561,13 @@ class AvroToMceSchemaConverter:
542
561
  for sub_schema in self._get_sub_schemas(actual_schema):
543
562
  yield from self._to_mce_fields(sub_schema)
544
563
 
564
+ # Clean up the processing stack
565
+ if (
566
+ isinstance(schema, avro.schema.RecordSchema)
567
+ and schema.fullname in self._record_types_seen
568
+ ):
569
+ self._record_types_seen.remove(schema.fullname)
570
+
545
571
  def _gen_non_nested_to_mce_fields(
546
572
  self, schema: SchemaOrField
547
573
  ) -> Iterable[SchemaField]:
@@ -90,6 +90,11 @@ class ClassificationHandler:
90
90
 
91
91
  def get_classifiers(self) -> List[Classifier]:
92
92
  classifiers = []
93
+ if (
94
+ not isinstance(self.config, ClassificationSourceConfigMixin)
95
+ or self.config.classification is None
96
+ ):
97
+ return classifiers
93
98
 
94
99
  for classifier in self.config.classification.classifiers:
95
100
  classifier_class = classifier_registry.get(classifier.type)
@@ -319,8 +324,10 @@ def classification_workunit_processor(
319
324
  partial(
320
325
  data_reader.get_sample_data_for_table,
321
326
  table_id,
322
- classification_handler.config.classification.sample_size
323
- * SAMPLE_SIZE_MULTIPLIER,
327
+ int(
328
+ classification_handler.config.classification.sample_size
329
+ * SAMPLE_SIZE_MULTIPLIER
330
+ ),
324
331
  **(data_reader_kwargs or {}),
325
332
  )
326
333
  if data_reader