acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mock_data/table_naming_helper.py (new file)

@@ -0,0 +1,97 @@
+from typing import Dict, Optional
+
+
+class TableNamingHelper:
+    """
+    Helper class for managing table naming conventions in mock data generation.
+
+    Table naming pattern: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+    """
+
+    @staticmethod
+    def generate_table_name(
+        lineage_hops: int,
+        lineage_fan_out: int,
+        level: int,
+        table_index: int,
+        prefix: Optional[str] = None,
+    ) -> str:
+        """
+        Generate a table name following the standard naming convention.
+
+        Args:
+            lineage_hops: Total number of hops in the lineage graph
+            lineage_fan_out: Number of downstream tables per upstream table
+            level: Level of the table in the lineage graph (0-based)
+            table_index: Index of the table within its level (0-based)
+            prefix: Optional prefix to add to the table name
+
+        Returns:
+            Table name following the pattern: "{prefix}hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+        """
+        base_name = f"hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+        return f"{prefix}{base_name}" if prefix else base_name
+
+    @staticmethod
+    def parse_table_name(table_name: str) -> Dict[str, int]:
+        """
+        Parse a table name to extract its components.
+
+        Args:
+            table_name: Table name following the standard naming convention
+
+        Returns:
+            Dictionary containing parsed components:
+                - lineage_hops: Total number of hops in the lineage graph
+                - lineage_fan_out: Number of downstream tables per upstream table
+                - level: Level of the table in the lineage graph (0-based)
+                - table_index: Index of the table within its level (0-based)
+
+        Raises:
+            ValueError: If the table name doesn't follow the expected pattern
+        """
+        try:
+            # Expected pattern: "hops_{lineage_hops}_f_{lineage_fan_out}_h{level}_t{table_index}"
+            parts = table_name.split("_")
+
+            if (
+                len(parts) != 6
+                or parts[0] != "hops"
+                or parts[2] != "f"
+                or not parts[4].startswith("h")
+                or not parts[5].startswith("t")
+            ):
+                raise ValueError(f"Invalid table name format: {table_name}")
+
+            lineage_hops = int(parts[1])
+            lineage_fan_out = int(parts[3])  # lineage_fan_out is at index 3
+            level = int(parts[4][1:])  # Remove 'h' prefix from parts[4]
+            table_index = int(parts[5][1:])  # Remove 't' prefix from parts[5]
+
+            return {
+                "lineage_hops": lineage_hops,
+                "lineage_fan_out": lineage_fan_out,
+                "level": level,
+                "table_index": table_index,
+            }
+        except (ValueError, IndexError) as e:
+            raise ValueError(
+                f"Failed to parse table name '{table_name}': {str(e)}"
+            ) from e
+
+    @staticmethod
+    def is_valid_table_name(table_name: str) -> bool:
+        """
+        Check if a table name follows the expected naming convention.
+
+        Args:
+            table_name: Table name to validate
+
+        Returns:
+            True if the table name follows the expected pattern, False otherwise
+        """
+        try:
+            TableNamingHelper.parse_table_name(table_name)
+            return True
+        except ValueError:
+            return False
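
A quick round-trip through the new helper, for orientation (the module path comes from the file list above; the values are illustrative):

from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper

# Level 1, table 4 in a 2-hop graph with fan-out 3 -> "hops_2_f_3_h1_t4"
name = TableNamingHelper.generate_table_name(
    lineage_hops=2, lineage_fan_out=3, level=1, table_index=4
)
assert TableNamingHelper.is_valid_table_name(name)
assert TableNamingHelper.parse_table_name(name) == {
    "lineage_hops": 2,
    "lineage_fan_out": 3,
    "level": 1,
    "table_index": 4,
}

# Note that a prefixed name has seven underscore-separated parts rather than six,
# so parse_table_name and is_valid_table_name reject it.
assert not TableNamingHelper.is_valid_table_name(
    TableNamingHelper.generate_table_name(2, 3, 1, 4, prefix="demo_")
)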
datahub/ingestion/source/mode.py

@@ -7,7 +7,16 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
+from typing import (
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)

 import dateutil.parser as dp
 import psutil
@@ -24,7 +33,7 @@ from requests.models import HTTPBasicAuth, HTTPError
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
 )
@@ -200,10 +209,13 @@ class ModeConfig(
         default=True, description="Tag measures and dimensions in the schema"
     )

-    items_per_page: int = Field(
-        default=DEFAULT_API_ITEMS_PER_PAGE,
+    items_per_page: HiddenFromDocs[int] = Field(
+        DEFAULT_API_ITEMS_PER_PAGE,
         description="Number of items per page for paginated API requests.",
-        hidden_from_docs=True,
+    )
+
+    exclude_archived: bool = Field(
+        default=False, description="Exclude archived reports"
     )

     @validator("connect_uri")
@@ -1465,6 +1477,15 @@ class ModeSource(StatefulIngestionSourceBase):
                 logger.debug(
                     f"Read {len(reports_page)} reports records from workspace {self.workspace_uri} space {space_token}"
                 )
+                if self.config.exclude_archived:
+                    logger.debug(
+                        f"Excluding archived reports since exclude_archived: {self.config.exclude_archived}"
+                    )
+                    reports_page = [
+                        report
+                        for report in reports_page
+                        if not report.get("archived", False)
+                    ]
                 yield reports_page
             except ModeRequestError as e:
                 if isinstance(e, HTTPError) and e.response.status_code == 404:
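
For context, a minimal sketch of what the new exclude_archived flag does to one page of report records; the "archived" key and the filter mirror the hunk above, while the sample records are made up:

# Illustrative records only; Mode's API returns richer report objects.
reports_page = [
    {"token": "r1", "archived": False},
    {"token": "r2", "archived": True},
    {"token": "r3"},  # a missing "archived" key is treated as not archived
]

exclude_archived = True  # corresponds to ModeConfig.exclude_archived
if exclude_archived:
    reports_page = [
        report for report in reports_page if not report.get("archived", False)
    ]

assert [r["token"] for r in reports_page] == ["r1", "r3"]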
datahub/ingestion/source/mongodb.py

@@ -36,7 +36,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.schema_inference.object import (
     SchemaDescription,
     construct_schema,
@@ -249,6 +252,13 @@ def construct_schema_pymongo(
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.DATABASE,
+    ],
+)
 @dataclass
 class MongoDBSource(StatefulIngestionSourceBase):
     """
datahub/ingestion/source/neo4j/neo4j_source.py

@@ -1,7 +1,6 @@
 import logging
-import time
 from dataclasses import dataclass
-from typing import Any, Dict, Iterable, List, Optional, Type, Union
+from typing import Dict, Iterable, List, Optional, Tuple

 import pandas as pd
 from neo4j import GraphDatabase
@@ -11,11 +10,6 @@ from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
 )
-from datahub.emitter.mce_builder import (
-    make_data_platform_urn,
-    make_dataset_urn_with_platform_instance,
-)
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -28,7 +22,6 @@ from datahub.ingestion.api.source import (
     MetadataWorkUnitProcessor,
     SourceCapability,
 )
-from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -40,36 +33,14 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionReport,
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaFieldDataType
-from datahub.metadata.schema_classes import (
-    AuditStampClass,
-    BooleanTypeClass,
-    DatasetPropertiesClass,
-    DateTypeClass,
-    NullTypeClass,
-    NumberTypeClass,
-    OtherSchemaClass,
-    SchemaFieldClass,
-    SchemaMetadataClass,
-    StringTypeClass,
-    SubTypesClass,
-    UnionTypeClass,
-)
+from datahub.sdk.dataset import Dataset

 log = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)

-_type_mapping: Dict[Union[Type, str], Type] = {
-    "list": UnionTypeClass,
-    "boolean": BooleanTypeClass,
-    "integer": NumberTypeClass,
-    "local_date_time": DateTypeClass,
-    "float": NumberTypeClass,
-    "string": StringTypeClass,
-    "date": DateTypeClass,
-    "node": StringTypeClass,
-    "relationship": StringTypeClass,
-}
+# Neo4j object types
+_NODE = "node"
+_RELATIONSHIP = "relationship"


 class Neo4jConfig(
@@ -78,7 +49,6 @@ class Neo4jConfig(
     username: str = Field(description="Neo4j Username")
     password: str = Field(description="Neo4j Password")
     uri: str = Field(description="The URI for the Neo4j server")
-    env: str = Field(description="Neo4j env")

     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None

@@ -96,8 +66,6 @@ class Neo4jSourceReport(StatefulIngestionReport):
 )
 @support_status(SupportStatus.CERTIFIED)
 class Neo4jSource(StatefulIngestionSourceBase):
-    NODE = "node"
-    RELATIONSHIP = "relationship"
     config: Neo4jConfig
     report: Neo4jSourceReport

@@ -113,78 +81,59 @@ class Neo4jSource(StatefulIngestionSourceBase):
         config = Neo4jConfig.parse_obj(config_dict)
         return cls(config, ctx)

-    def get_field_type(self, attribute_type: Union[type, str]) -> SchemaFieldDataType:
-        type_class: type = _type_mapping.get(attribute_type, NullTypeClass)
-        return SchemaFieldDataType(type=type_class())
-
-    def get_schema_field_class(
-        self, col_name: str, col_type: str, **kwargs: Any
-    ) -> SchemaFieldClass:
-        if kwargs["obj_type"] == self.NODE and col_type == self.RELATIONSHIP:
-            col_type = self.NODE
-        else:
-            col_type = col_type
-        return SchemaFieldClass(
-            fieldPath=col_name,
-            type=self.get_field_type(col_type),
-            nativeDataType=col_type,
-            description=col_type.upper()
-            if col_type in (self.NODE, self.RELATIONSHIP)
-            else col_type,
-            lastModified=AuditStampClass(
-                time=round(time.time() * 1000), actor="urn:li:corpuser:ingestion"
-            ),
+    def create_schema_field_tuple(
+        self, col_name: str, col_type: str, obj_type: Optional[str]
+    ) -> Tuple[str, str, str]:
+        """Convert Neo4j property to (field_name, field_type, description) tuple."""
+        # Special case: when a node has a relationship-typed property, treat it as a node reference
+        # This ensures relationship properties within nodes are described as "NODE" rather than "RELATIONSHIP"
+        column_type = (
+            _NODE if obj_type == _NODE and col_type == _RELATIONSHIP else col_type
+        )
+
+        description = (
+            column_type.upper()
+            if column_type in (_NODE, _RELATIONSHIP)
+            else column_type
         )

-    def add_properties(
+        return (col_name, column_type, description)
+
+    def get_subtype_from_obj_type(self, obj_type: str) -> str:
+        """Map Neo4j object type to DataHub subtype."""
+        if obj_type == _NODE:
+            return DatasetSubTypes.NEO4J_NODE
+        elif obj_type == _RELATIONSHIP:
+            return DatasetSubTypes.NEO4J_RELATIONSHIP
+        return DatasetSubTypes.NEO4J_NODE  # default fallback
+
+    def create_neo4j_dataset(
         self,
         dataset: str,
+        columns: list,
+        obj_type: Optional[str] = None,
         description: Optional[str] = None,
-        custom_properties: Optional[Dict[str, str]] = None,
-    ) -> Iterable[MetadataWorkUnit]:
-        dataset_properties = DatasetPropertiesClass(
-            description=description,
-            customProperties=custom_properties,
-        )
-        yield MetadataChangeProposalWrapper(
-            entityUrn=make_dataset_urn_with_platform_instance(
-                platform=self.platform,
-                name=dataset,
-                platform_instance=self.config.platform_instance,
-                env=self.config.env,
-            ),
-            aspect=dataset_properties,
-        ).as_workunit()
-
-    def generate_neo4j_object(
-        self, dataset: str, columns: list, obj_type: Optional[str] = None
-    ) -> Optional[MetadataChangeProposalWrapper]:
+    ) -> Optional[Dataset]:
+        """Create Dataset entity with Neo4j schema and metadata."""
         try:
-            fields = [
-                self.get_schema_field_class(key, value.lower(), obj_type=obj_type)
+            schema_fields = [
+                self.create_schema_field_tuple(
+                    col_name=key, col_type=value.lower(), obj_type=obj_type
+                )
                 for d in columns
                 for key, value in d.items()
             ]
-            return MetadataChangeProposalWrapper(
-                entityUrn=make_dataset_urn_with_platform_instance(
-                    platform=self.platform,
-                    name=dataset,
-                    platform_instance=self.config.platform_instance,
-                    env=self.config.env,
-                ),
-                aspect=SchemaMetadataClass(
-                    schemaName=dataset,
-                    platform=make_data_platform_urn(self.platform),
-                    version=0,
-                    hash="",
-                    platformSchema=OtherSchemaClass(rawSchema=""),
-                    lastModified=AuditStampClass(
-                        time=round(time.time() * 1000),
-                        actor="urn:li:corpuser:ingestion",
-                    ),
-                    fields=fields,
-                ),
+
+            return Dataset(
+                platform=self.platform,
+                name=dataset,
+                platform_instance=self.config.platform_instance,
+                env=self.config.env,
+                schema=schema_fields,
+                subtype=self.get_subtype_from_obj_type(obj_type or _NODE),
+                description=description,
             )
+
         except Exception as e:
             log.error(e)
             self.report.report_failure(
@@ -199,21 +148,24 @@ class Neo4jSource(StatefulIngestionSourceBase):
             self.config.uri, auth=(self.config.username, self.config.password)
         )
         """
-        This process retrieves the metadata for Neo4j objects using an APOC query, which returns a dictionary
-        with two columns: key and value. The key represents the Neo4j object, while the value contains the
-        corresponding metadata.
+        This process retrieves the metadata for Neo4j objects using an APOC query,
+        which returns a dictionary with two columns: key and value. The key represents
+        the Neo4j object, while the value contains the corresponding metadata.

-        When data is returned from Neo4j, much of the relationship metadata is stored with the relevant node's
-        metadata. Consequently, the objects are organized into two separate dataframes: one for nodes and one for
-        relationships.
+        When data is returned from Neo4j, much of the relationship metadata is stored
+        with the relevant node's metadata. Consequently, the objects are organized
+        into two separate dataframes: one for nodes and one for relationships.

-        In the node dataframe, several fields are extracted and added as new columns. Similarly, in the relationship
-        dataframe, certain fields are parsed out, while others require metadata from the nodes dataframe.
+        In the node dataframe, several fields are extracted and added as new columns.
+        Similarly, in the relationship dataframe, certain fields are parsed out,
+        while others require metadata from the nodes dataframe.

-        Once the data is parsed and these two dataframes are created, we combine a subset of their columns into a
-        single dataframe, which will be used to create the DataHub objects.
+        Once the data is parsed and these two dataframes are created, we combine
+        a subset of their columns into a single dataframe, which will be used to
+        create the DataHub objects.

-        See the docs for examples of metadata: metadata-ingestion/docs/sources/neo4j/neo4j.md
+        See the docs for examples of metadata:
+        metadata-ingestion/docs/sources/neo4j/neo4j.md
         """
         try:
             log.info(f"{query}")
@@ -238,7 +190,7 @@ class Neo4jSource(StatefulIngestionSourceBase):
         return None

     def process_nodes(self, data: list) -> pd.DataFrame:
-        nodes = [record for record in data if record["value"]["type"] == self.NODE]
+        nodes = [record for record in data if record["value"]["type"] == _NODE]
         node_df = pd.DataFrame(
             nodes,
             columns=["key", "value"],
@@ -261,9 +213,7 @@ class Neo4jSource(StatefulIngestionSourceBase):
         return node_df

     def process_relationships(self, data: list, node_df: pd.DataFrame) -> pd.DataFrame:
-        rels = [
-            record for record in data if record["value"]["type"] == self.RELATIONSHIP
-        ]
+        rels = [record for record in data if record["value"]["type"] == _RELATIONSHIP]
         rel_df = pd.DataFrame(rels, columns=["key", "value"])
         rel_df["obj_type"] = rel_df["value"].apply(
             lambda record: self.get_obj_type(record)
@@ -331,51 +281,40 @@ class Neo4jSource(StatefulIngestionSourceBase):
         ]

     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        df = self.get_neo4j_metadata(
-            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key RETURN key, value[key] AS value;"
+        query = (
+            "CALL apoc.meta.schema() YIELD value UNWIND keys(value) AS key "
+            "RETURN key, value[key] AS value;"
         )
+        df = self.get_neo4j_metadata(query)
         if df is None:
             log.warning("No metadata retrieved from Neo4j")
             return

         for _, row in df.iterrows():
             try:
-                neo4j_obj = self.generate_neo4j_object(
-                    columns=row["property_data_types"],
-                    dataset=row["key"],
-                )
-                if neo4j_obj:
-                    yield from auto_workunit([neo4j_obj])
-
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=make_dataset_urn_with_platform_instance(
-                        platform=self.platform,
-                        name=row["key"],
-                        platform_instance=self.config.platform_instance,
-                        env=self.config.env,
-                    ),
-                    aspect=SubTypesClass(
-                        typeNames=[
-                            DatasetSubTypes.NEO4J_NODE
-                            if row["obj_type"] == self.NODE
-                            else DatasetSubTypes.NEO4J_RELATIONSHIP
-                        ]
-                    ),
-                ).as_workunit()
-
-                yield from self.add_properties(
+                dataset_obj = self.create_neo4j_dataset(
                     dataset=row["key"],
-                    custom_properties=None,
+                    columns=row["property_data_types"],
+                    obj_type=row["obj_type"],
                     description=row["description"],
                 )

+                if dataset_obj:
+                    yield from dataset_obj.as_workunits()
+                    self.report.obj_created += 1
+                else:
+                    log.warning(f"Failed to create dataset object for {row['key']}")
+                    self.report.obj_failures += 1
+
             except Exception as e:
-                log.error(f"Failed to process row {row['key']}: {str(e)}")
-                self.report.report_failure(
-                    message="Error processing Neo4j metadata",
+                log.warning(f"Failed to process row {row['key']}: {str(e)}")
+                self.report.report_warning(
+                    title="Error processing Neo4j metadata",
+                    message="Some entities will be missed",
                     context=row["key"],
                     exc=e,
                 )
+                self.report.obj_failures += 1

     def get_report(self) -> "Neo4jSourceReport":
         return self.report
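
The main behavioral subtlety in the rewritten Neo4j source is the node/relationship special case in create_schema_field_tuple. A standalone sketch of that logic, lifted from the hunk above so it runs outside the class:

_NODE = "node"
_RELATIONSHIP = "relationship"

def schema_field_tuple(col_name: str, col_type: str, obj_type: str):
    # A relationship-typed property on a node is reported as a NODE reference.
    column_type = _NODE if obj_type == _NODE and col_type == _RELATIONSHIP else col_type
    description = (
        column_type.upper() if column_type in (_NODE, _RELATIONSHIP) else column_type
    )
    return (col_name, column_type, description)

assert schema_field_tuple("knows", _RELATIONSHIP, obj_type=_NODE) == ("knows", "node", "NODE")
assert schema_field_tuple("age", "integer", obj_type=_NODE) == ("age", "integer", "integer")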
datahub/ingestion/source/nifi.py

@@ -72,7 +72,7 @@ NIFI = "nifi"
 # and here - https://github.com/psf/requests/issues/1573
 class SSLAdapter(HTTPAdapter):
     def __init__(self, certfile, keyfile, password=None):
-        self.context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
+        self.context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
         self.context.load_cert_chain(
             certfile=certfile, keyfile=keyfile, password=password
         )
@@ -166,7 +166,7 @@ class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     )

     @root_validator(skip_on_failure=True)
-    def validate_auth_params(cla, values):
+    def validate_auth_params(cls, values):
        if values.get("auth") is NifiAuthType.CLIENT_CERT and not values.get(
            "client_cert_file"
        ):
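
The ssl.Purpose change is a genuine bug fix: in the standard library, Purpose names the peer being authenticated, so a client that verifies the NiFi server's certificate needs SERVER_AUTH, while CLIENT_AUTH builds a server-side context. A minimal sketch of the corrected client-side context, with placeholder certificate paths:

import ssl

# Client-side TLS context: verify the *server's* certificate (Purpose.SERVER_AUTH),
# then attach our own client certificate for mutual TLS.
context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
context.load_cert_chain(
    certfile="/path/to/client.pem",  # placeholder path
    keyfile="/path/to/client.key",   # placeholder path
)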
datahub/ingestion/source/openapi.py

@@ -333,7 +333,7 @@ class APISource(Source, ABC):
                     ),
                 )
                 yield wu
-            elif endpoint_dets["method"] != "get":
+            elif endpoint_dets["method"] != "GET":
                 self.report.report_warning(
                     title="Failed to Extract Endpoint Metadata",
                     message=f"No example provided for {endpoint_dets['method']}",
datahub/ingestion/source/powerbi/config.py

@@ -4,11 +4,10 @@ from enum import Enum
 from typing import Dict, List, Literal, Optional, Union

 import pydantic
-from pydantic import validator
-from pydantic.class_validators import root_validator
+from pydantic import root_validator, validator

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.api.incremental_lineage_helper import (
@@ -291,22 +290,18 @@ class PowerBiProfilingConfig(ConfigModel):
 class PowerBiDashboardSourceConfig(
     StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
-    platform_name: str = pydantic.Field(
-        default=Constant.PLATFORM_NAME, hidden_from_docs=True
-    )
+    platform_name: HiddenFromDocs[str] = pydantic.Field(default=Constant.PLATFORM_NAME)

-    platform_urn: str = pydantic.Field(
+    platform_urn: HiddenFromDocs[str] = pydantic.Field(
         default=builder.make_data_platform_urn(platform=Constant.PLATFORM_NAME),
-        hidden_from_docs=True,
     )

     # Organization Identifier
     tenant_id: str = pydantic.Field(description="PowerBI tenant identifier")
     # PowerBi workspace identifier
-    workspace_id: Optional[str] = pydantic.Field(
+    workspace_id: HiddenFromDocs[Optional[str]] = pydantic.Field(
         default=None,
         description="[deprecated] Use workspace_id_pattern instead",
-        hidden_from_docs=True,
     )
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
@@ -326,15 +321,14 @@ class PowerBiDashboardSourceConfig(
     # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
     # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
-    dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = (
-        pydantic.Field(
-            default_factory=default_for_dataset_type_mapping,
-            description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
-            "DataHub supported datasources."
-            "You can configured platform instance for dataset lineage. "
-            "See Quickstart Recipe for mapping",
-            hidden_from_docs=True,
-        )
+    dataset_type_mapping: HiddenFromDocs[
+        Union[Dict[str, str], Dict[str, PlatformDetail]]
+    ] = pydantic.Field(
+        default_factory=default_for_dataset_type_mapping,
+        description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+        "DataHub supported datasources."
+        "You can configured platform instance for dataset lineage. "
+        "See Quickstart Recipe for mapping",
     )
     # PowerBI datasource's server to platform instance mapping
     server_to_platform_instance: Dict[
@@ -353,6 +347,19 @@ class PowerBiDashboardSourceConfig(
         "For example with an ODBC connection string 'DSN=database' where the database type "
         "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
     )
+    # ODBC DSN to database (or database.schema) mapping
+    dsn_to_database_schema: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to database names with optional schema names "
+        "(some database platforms such a MySQL use the table name pattern 'database.table', "
+        "while others use the pattern 'database.schema.table'). "
+        "This mapping is used in conjunction with ODBC SQL query parsing. "
+        "If SQL queries used with ODBC do not reference fully qualified tables names, "
+        "then you should configure mappings for your DSNs. "
+        "For example with an ODBC connection string 'DSN=database' where the database "
+        "is 'prod' you would configure the mapping as 'database: prod'. "
+        "If the database is 'prod' and the schema is 'data' then mapping would be 'database: prod.data'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
@@ -528,10 +535,9 @@ class PowerBiDashboardSourceConfig(
         "Increase this value if you encounter the 'M-Query Parsing Timeout' message in the connector report.",
     )

-    metadata_api_timeout: int = pydantic.Field(
+    metadata_api_timeout: HiddenFromDocs[int] = pydantic.Field(
         default=30,
         description="timeout in seconds for Metadata Rest Api.",
-        hidden_from_docs=True,
     )

     @root_validator(skip_on_failure=True)
@@ -614,3 +620,23 @@ class PowerBiDashboardSourceConfig(
             "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
         )
         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_dsn_to_database_schema(cls, values: Dict) -> Dict:
+        if values.get("dsn_to_database_schema") is not None:
+            dsn_mapping = values.get("dsn_to_database_schema")
+            if not isinstance(dsn_mapping, dict):
+                raise ValueError("dsn_to_database_schema must contain key-value pairs")
+
+            for _key, value in dsn_mapping.items():
+                if not isinstance(value, str):
+                    raise ValueError(
+                        "dsn_to_database_schema mapping values must be strings"
+                    )
+                parts = value.split(".")
+                if len(parts) != 1 and len(parts) != 2:
+                    raise ValueError(
+                        f"dsn_to_database_schema invalid mapping value: {value}"
+                    )
+
+        return values
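
Per the new validator, each dsn_to_database_schema value must be either "database" or "database.schema". A small sketch of the accepted shapes (the DSN names are illustrative):

# Values accepted by validate_dsn_to_database_schema above:
dsn_to_database_schema = {
    "mysql_dsn": "prod",          # database only, for 'database.table' platforms
    "postgres_dsn": "prod.data",  # database.schema, for 'database.schema.table' platforms
}
for value in dsn_to_database_schema.values():
    assert len(value.split(".")) in (1, 2)

# A value like "prod.data.extra" (three dot-separated parts) would raise ValueError.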
datahub/ingestion/source/powerbi/m_query/data_classes.py

@@ -76,3 +76,4 @@ class FunctionName(Enum):
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
     MYSQL_DATA_ACCESS = "MySQL.Database"
     ODBC_DATA_ACCESS = "Odbc.DataSource"
+    ODBC_QUERY = "Odbc.Query"