acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -36,7 +36,10 @@ from datahub.ingestion.api.decorators import (
36
36
  )
37
37
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor
38
38
  from datahub.ingestion.api.workunit import MetadataWorkUnit
39
- from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
39
+ from datahub.ingestion.source.common.subtypes import (
40
+ DatasetContainerSubTypes,
41
+ SourceCapabilityModifier,
42
+ )
40
43
  from datahub.ingestion.source.schema_inference.object import (
41
44
  SchemaDescription,
42
45
  construct_schema,
@@ -249,6 +252,13 @@ def construct_schema_pymongo(
249
252
  @support_status(SupportStatus.CERTIFIED)
250
253
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
251
254
  @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
255
+ @capability(
256
+ SourceCapability.CONTAINERS,
257
+ "Enabled by default",
258
+ subtype_modifier=[
259
+ SourceCapabilityModifier.DATABASE,
260
+ ],
261
+ )
252
262
  @dataclass
253
263
  class MongoDBSource(StatefulIngestionSourceBase):
254
264
  """
@@ -1,74 +1,56 @@
1
1
  import logging
2
- import time
3
2
  from dataclasses import dataclass
4
- from typing import Any, Dict, Iterable, List, Optional, Type, Union
3
+ from typing import Dict, Iterable, List, Optional, Tuple
5
4
 
6
5
  import pandas as pd
7
6
  from neo4j import GraphDatabase
8
- from pydantic.fields import Field
7
+ from pydantic import Field
9
8
 
10
9
  from datahub.configuration.source_common import (
11
10
  EnvConfigMixin,
11
+ PlatformInstanceConfigMixin,
12
12
  )
13
- from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn
14
- from datahub.emitter.mcp import MetadataChangeProposalWrapper
15
13
  from datahub.ingestion.api.common import PipelineContext
16
14
  from datahub.ingestion.api.decorators import (
17
15
  SupportStatus,
16
+ capability,
18
17
  config_class,
19
18
  platform_name,
20
19
  support_status,
21
20
  )
22
21
  from datahub.ingestion.api.source import (
23
22
  MetadataWorkUnitProcessor,
23
+ SourceCapability,
24
24
  )
25
25
  from datahub.ingestion.api.workunit import MetadataWorkUnit
26
26
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
27
27
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
28
28
  StaleEntityRemovalHandler,
29
+ StatefulStaleMetadataRemovalConfig,
29
30
  )
30
31
  from datahub.ingestion.source.state.stateful_ingestion_base import (
31
32
  StatefulIngestionConfigBase,
32
33
  StatefulIngestionReport,
33
34
  StatefulIngestionSourceBase,
34
35
  )
35
- from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
36
- from datahub.metadata.schema_classes import (
37
- AuditStampClass,
38
- BooleanTypeClass,
39
- DatasetPropertiesClass,
40
- DateTypeClass,
41
- NullTypeClass,
42
- NumberTypeClass,
43
- OtherSchemaClass,
44
- SchemaFieldClass,
45
- SchemaMetadataClass,
46
- StringTypeClass,
47
- SubTypesClass,
48
- UnionTypeClass,
49
- )
36
+ from datahub.sdk.dataset import Dataset
50
37
 
51
38
  log = logging.getLogger(__name__)
52
39
  logging.basicConfig(level=logging.INFO)
53
40
 
54
- _type_mapping: Dict[Union[Type, str], Type] = {
55
- "list": UnionTypeClass,
56
- "boolean": BooleanTypeClass,
57
- "integer": NumberTypeClass,
58
- "local_date_time": DateTypeClass,
59
- "float": NumberTypeClass,
60
- "string": StringTypeClass,
61
- "date": DateTypeClass,
62
- "node": StringTypeClass,
63
- "relationship": StringTypeClass,
64
- }
41
+ # Neo4j object types
42
+ _NODE = "node"
43
+ _RELATIONSHIP = "relationship"
65
44
 
66
45
 
67
- class Neo4jConfig(EnvConfigMixin, StatefulIngestionConfigBase):
46
+ class Neo4jConfig(
47
+ StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
48
+ ):
68
49
  username: str = Field(description="Neo4j Username")
69
50
  password: str = Field(description="Neo4j Password")
70
51
  uri: str = Field(description="The URI for the Neo4j server")
71
- env: str = Field(description="Neo4j env")
52
+
53
+ stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
72
54
 
73
55
 
74
56
  @dataclass
@@ -79,114 +61,111 @@ class Neo4jSourceReport(StatefulIngestionReport):
79
61
 
80
62
  @platform_name("Neo4j", id="neo4j")
81
63
  @config_class(Neo4jConfig)
64
+ @capability(
65
+ SourceCapability.PLATFORM_INSTANCE, "Supported via the `platform_instance` config"
66
+ )
82
67
  @support_status(SupportStatus.CERTIFIED)
83
68
  class Neo4jSource(StatefulIngestionSourceBase):
84
- NODE = "node"
85
- RELATIONSHIP = "relationship"
86
- PLATFORM = "neo4j"
69
+ config: Neo4jConfig
70
+ report: Neo4jSourceReport
87
71
 
88
- def __init__(self, ctx: PipelineContext, config: Neo4jConfig):
72
+ def __init__(self, config: Neo4jConfig, ctx: PipelineContext):
73
+ super().__init__(config, ctx)
89
74
  self.ctx = ctx
90
75
  self.config = config
76
+ self.platform = "neo4j"
91
77
  self.report: Neo4jSourceReport = Neo4jSourceReport()
92
78
 
93
79
  @classmethod
94
- def create(cls, config_dict, ctx):
80
+ def create(cls, config_dict: Dict, ctx: PipelineContext) -> "Neo4jSource":
95
81
  config = Neo4jConfig.parse_obj(config_dict)
96
- return cls(ctx, config)
97
-
98
- def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
99
- type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
100
- return SchemaFieldDataType(type=type_class())
101
-
102
- def get_schema_field_class(
103
- self, col_name: str, col_type: str, **kwargs: Any
104
- ) -> SchemaFieldClass:
105
- if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP:
106
- col_type = self.NODE
107
- else:
108
- col_type = col_type
109
- return SchemaFieldClass(
110
- fieldPath=col_name,
111
- type=self.get_field_type(col_type),
112
- nativeDataType=col_type,
113
- description=col_type.upper()
114
- if col_type in (self.NODE, self.RELATIONSHIP)
115
- else col_type,
116
- lastModified=AuditStampClass(
117
- time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
118
- ),
82
+ return cls(config, ctx)
83
+
84
+ def create_schema_field_tuple(
85
+ self, col_name: str, col_type: str, obj_type: Optional[str]
86
+ ) -> Tuple[str, str, str]:
87
+ """Convert Neo4j property to (field_name, field_type, description) tuple."""
88
+ # Special case: when a node has a relationship-typed property, treat it as a node reference
89
+ # This ensures relationship properties within nodes are described as "NODE" rather than "RELATIONSHIP"
90
+ column_type = (
91
+ _NODE if obj_type == _NODE and col_type == _RELATIONSHIP else col_type
119
92
  )
120
93
 
121
- def add_properties(
94
+ description = (
95
+ column_type.upper()
96
+ if column_type in (_NODE, _RELATIONSHIP)
97
+ else column_type
98
+ )
99
+
100
+ return (col_name, column_type, description)
101
+
102
+ def get_subtype_from_obj_type(self, obj_type: str) -> str:
103
+ """Map Neo4j object type to DataHub subtype."""
104
+ if obj_type == _NODE:
105
+ return DatasetSubTypes.NEO4J_NODE
106
+ elif obj_type == _RELATIONSHIP:
107
+ return DatasetSubTypes.NEO4J_RELATIONSHIP
108
+ return DatasetSubTypes.NEO4J_NODE # default fallback
109
+
110
+ def create_neo4j_dataset(
122
111
  self,
123
112
  dataset: str,
113
+ columns: list,
114
+ obj_type: Optional[str] = None,
124
115
  description: Optional[str] = None,
125
- custom_properties: Optional[Dict[str, str]] = None,
126
- ) -> MetadataChangeProposalWrapper:
127
- dataset_properties = DatasetPropertiesClass(
128
- description=description,
129
- customProperties=custom_properties,
130
- )
131
- return MetadataChangeProposalWrapper(
132
- entityUrn=make_dataset_urn(
133
- platform=self.PLATFORM, name=dataset, env=self.config.env
134
- ),
135
- aspect=dataset_properties,
136
- )
137
-
138
- def generate_neo4j_object(
139
- self, dataset: str, columns: list, obj_type: Optional[str] = None
140
- ) -> MetadataChangeProposalWrapper:
116
+ ) -> Optional[Dataset]:
117
+ """Create Dataset entity with Neo4j schema and metadata."""
141
118
  try:
142
- fields = [
143
- self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
119
+ schema_fields = [
120
+ self.create_schema_field_tuple(
121
+ col_name=key, col_type=value.lower(), obj_type=obj_type
122
+ )
144
123
  for d in columns
145
124
  for key, value in d.items()
146
125
  ]
147
- mcp = MetadataChangeProposalWrapper(
148
- entityUrn=make_dataset_urn(
149
- platform=self.PLATFORM, name=dataset, env=self.config.env
150
- ),
151
- aspect=SchemaMetadataClass(
152
- schemaName=dataset,
153
- platform=make_data_platform_urn(self.PLATFORM),
154
- version=0,
155
- hash="",
156
- platformSchema=OtherSchemaClass(rawSchema=""),
157
- lastModified=AuditStampClass(
158
- time=round(time.time() * 1000),
159
- actor="urn:li:corpuser:ingestion",
160
- ),
161
- fields=fields,
162
- ),
126
+
127
+ return Dataset(
128
+ platform=self.platform,
129
+ name=dataset,
130
+ platform_instance=self.config.platform_instance,
131
+ env=self.config.env,
132
+ schema=schema_fields,
133
+ subtype=self.get_subtype_from_obj_type(obj_type or _NODE),
134
+ description=description,
163
135
  )
164
- self.report.obj_created += 1
136
+
165
137
  except Exception as e:
166
138
  log.error(e)
167
- self.report.obj_failures += 1
168
- return mcp
139
+ self.report.report_failure(
140
+ message="Failed to process dataset",
141
+ context=dataset,
142
+ exc=e,
143
+ )
144
+ return None
169
145
 
170
- def get_neo4j_metadata(self, query: str) -> pd.DataFrame:
146
+ def get_neo4j_metadata(self, query: str) -> Optional[pd.DataFrame]:
171
147
  driver = GraphDatabase.driver(
172
148
  self.config.uri, auth=(self.config.username, self.config.password)
173
149
  )
174
150
  """
175
- This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary
176
- with two columns: key and value. The key represents the Neo4j object, while the value contains the
177
- corresponding metadata.
151
+ This process retrieves the metadata for Neo4j objects using an APOC query,
152
+ which returns a dictionary with two columns: key and value. The key represents
153
+ the Neo4j object, while the value contains the corresponding metadata.
178
154
 
179
- When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's
180
- metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for
181
- relationships.
155
+ When data is returned from Neo4j, much of the relationship metadata is stored
156
+ with the relevant node's metadata. Consequently, the objects are organized
157
+ into two separate dataframes: one for nodes and one for relationships.
182
158
 
183
- In the node dataframe, several fields are extracted and added as new columns. Similarly, in the relationship
184
- dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe.
159
+ In the node dataframe, several fields are extracted and added as new columns.
160
+ Similarly, in the relationship dataframe, certain fields are parsed out,
161
+ while others require metadata from the nodes dataframe.
185
162
 
186
- Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a
187
- single dataframe, which will be used to create the DataHub objects.
163
+ Once the data is parsed and these two dataframes are created, we combine
164
+ a subset of their columns into a single dataframe, which will be used to
165
+ create the DataHub objects.
188
166
 
189
- See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md
167
+ See the docs for examples of metadata:
168
+ metadata-ingestion/docs/sources/neo4j/neo4j.md
190
169
  """
191
170
  try:
192
171
  log.info(f"{query}")
@@ -201,16 +180,17 @@ class Neo4jSource(StatefulIngestionSourceBase):
201
180
 
202
181
  union_cols = ["key", "obj_type", "property_data_types", "description"]
203
182
  df = pd.concat([node_df[union_cols], rel_df[union_cols]])
183
+ return df
204
184
  except Exception as e:
205
185
  self.report.failure(
206
186
  message="Failed to get neo4j metadata",
207
187
  exc=e,
208
188
  )
209
189
 
210
- return df
190
+ return None
211
191
 
212
192
  def process_nodes(self, data: list) -> pd.DataFrame:
213
- nodes = [record for record in data if record["value"]["type"] == self.NODE]
193
+ nodes = [record for record in data if record["value"]["type"] == _NODE]
214
194
  node_df = pd.DataFrame(
215
195
  nodes,
216
196
  columns=["key", "value"],
@@ -233,9 +213,7 @@ class Neo4jSource(StatefulIngestionSourceBase):
233
213
  return node_df
234
214
 
235
215
  def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
236
- rels = [
237
- record for record in data if record["value"]["type"] == self.RELATIONSHIP
238
- ]
216
+ rels = [record for record in data if record["value"]["type"] == _RELATIONSHIP]
239
217
  rel_df = pd.DataFrame(rels, columns=["key", "value"])
240
218
  rel_df["obj_type"] = rel_df["value"].apply(
241
219
  lambda record: self.get_obj_type(record)
@@ -303,49 +281,40 @@ class Neo4jSource(StatefulIngestionSourceBase):
303
281
  ]
304
282
 
305
283
  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
306
- df = self.get_neo4j_metadata(
307
- "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
284
+ query = (
285
+ "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key "
286
+ "RETURN key, value[key] AS value;"
308
287
  )
288
+ df = self.get_neo4j_metadata(query)
289
+ if df is None:
290
+ log.warning("No metadata retrieved from Neo4j")
291
+ return
292
+
309
293
  for _, row in df.iterrows():
310
294
  try:
311
- yield MetadataWorkUnit(
312
- id=row["key"],
313
- mcp=self.generate_neo4j_object(
314
- columns=row["property_data_types"],
315
- dataset=row["key"],
316
- ),
317
- is_primary_source=True,
295
+ dataset_obj = self.create_neo4j_dataset(
296
+ dataset=row["key"],
297
+ columns=row["property_data_types"],
298
+ obj_type=row["obj_type"],
299
+ description=row["description"],
318
300
  )
319
301
 
320
- yield MetadataWorkUnit(
321
- id=row["key"],
322
- mcp=MetadataChangeProposalWrapper(
323
- entityUrn=make_dataset_urn(
324
- platform=self.PLATFORM,
325
- name=row["key"],
326
- env=self.config.env,
327
- ),
328
- aspect=SubTypesClass(
329
- typeNames=[
330
- DatasetSubTypes.NEO4J_NODE
331
- if row["obj_type"] == self.NODE
332
- else DatasetSubTypes.NEO4J_RELATIONSHIP
333
- ]
334
- ),
335
- ),
336
- )
337
-
338
- yield MetadataWorkUnit(
339
- id=row["key"],
340
- mcp=self.add_properties(
341
- dataset=row["key"],
342
- custom_properties=None,
343
- description=row["description"],
344
- ),
345
- )
302
+ if dataset_obj:
303
+ yield from dataset_obj.as_workunits()
304
+ self.report.obj_created += 1
305
+ else:
306
+ log.warning(f"Failed to create dataset object for {row['key']}")
307
+ self.report.obj_failures += 1
346
308
 
347
309
  except Exception as e:
348
- raise e
310
+ log.warning(f"Failed to process row {row['key']}: {str(e)}")
311
+ self.report.report_warning(
312
+ title="Error processing Neo4j metadata",
313
+ message="Some entities will be missed",
314
+ context=row["key"],
315
+ exc=e,
316
+ )
317
+ self.report.obj_failures += 1
349
318
 
350
- def get_report(self):
319
+ def get_report(self) -> "Neo4jSourceReport":
351
320
  return self.report
@@ -72,7 +72,7 @@ NIFI = "nifi"
72
72
  # and here - https://github.com/psf/requests/issues/1573
73
73
  class SSLAdapter(HTTPAdapter):
74
74
  def __init__(self, certfile, keyfile, password=None):
75
- self.context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
75
+ self.context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
76
76
  self.context.load_cert_chain(
77
77
  certfile=certfile, keyfile=keyfile, password=password
78
78
  )
@@ -166,7 +166,7 @@ class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
166
166
  )
167
167
 
168
168
  @root_validator(skip_on_failure=True)
169
- def validate_auth_params(cla, values):
169
+ def validate_auth_params(cls, values):
170
170
  if values.get("auth") is NifiAuthType.CLIENT_CERT and not values.get(
171
171
  "client_cert_file"
172
172
  ):
@@ -703,7 +703,7 @@ class NifiSource(StatefulIngestionSourceBase):
703
703
  if (
704
704
  component.nifi_type is NifiType.PROCESSOR
705
705
  and component.type
706
- not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS.keys()
706
+ not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
707
707
  ) or component.nifi_type not in [
708
708
  NifiType.PROCESSOR,
709
709
  NifiType.REMOTE_INPUT_PORT,
@@ -977,7 +977,7 @@ class NifiSource(StatefulIngestionSourceBase):
977
977
  )
978
978
 
979
979
  for incoming_from in incoming:
980
- if incoming_from in self.nifi_flow.remotely_accessible_ports.keys():
980
+ if incoming_from in self.nifi_flow.remotely_accessible_ports:
981
981
  dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
982
982
  dataset_urn = builder.make_dataset_urn(
983
983
  NIFI, dataset_name, self.config.env
@@ -994,7 +994,7 @@ class NifiSource(StatefulIngestionSourceBase):
994
994
  )
995
995
 
996
996
  for outgoing_to in outgoing:
997
- if outgoing_to in self.nifi_flow.remotely_accessible_ports.keys():
997
+ if outgoing_to in self.nifi_flow.remotely_accessible_ports:
998
998
  dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
999
999
  dataset_urn = builder.make_dataset_urn(
1000
1000
  NIFI, dataset_name, self.config.env