acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,355 @@
1
+ from typing import Iterable, List, Optional
2
+
3
+ from datahub.emitter.mce_builder import (
4
+ make_data_flow_urn,
5
+ make_data_job_urn,
6
+ make_data_platform_urn,
7
+ make_dataset_urn_with_platform_instance,
8
+ make_schema_field_urn,
9
+ )
10
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
11
+ from datahub.ingestion.api.common import PipelineContext
12
+ from datahub.ingestion.api.decorators import (
13
+ SupportStatus,
14
+ capability,
15
+ config_class,
16
+ platform_name,
17
+ support_status,
18
+ )
19
+ from datahub.ingestion.api.source import (
20
+ MetadataWorkUnitProcessor,
21
+ SourceCapability,
22
+ SourceReport,
23
+ )
24
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
25
+ from datahub.ingestion.graph.client import DataHubGraph
26
+ from datahub.ingestion.source.snaplogic.snaplogic_config import SnaplogicConfig
27
+ from datahub.ingestion.source.snaplogic.snaplogic_lineage_extractor import (
28
+ SnaplogicLineageExtractor,
29
+ )
30
+ from datahub.ingestion.source.snaplogic.snaplogic_parser import (
31
+ ColumnMapping,
32
+ Dataset,
33
+ SnapLogicParser,
34
+ )
35
+ from datahub.ingestion.source.snaplogic.snaplogic_utils import SnaplogicUtils
36
+ from datahub.ingestion.source.state.redundant_run_skip_handler import (
37
+ RedundantLineageRunSkipHandler,
38
+ )
39
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
40
+ StaleEntityRemovalHandler,
41
+ StaleEntityRemovalSourceReport,
42
+ )
43
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
44
+ StatefulIngestionSourceBase,
45
+ )
46
+ from datahub.metadata.schema_classes import (
47
+ DataFlowInfoClass,
48
+ DataJobInfoClass,
49
+ DataJobInputOutputClass,
50
+ DatasetPropertiesClass,
51
+ FineGrainedLineageClass,
52
+ FineGrainedLineageDownstreamTypeClass,
53
+ OtherSchemaClass,
54
+ SchemaFieldClass,
55
+ SchemaMetadataClass,
56
+ )
57
+
58
+
59
+ @platform_name("SnapLogic")
60
+ @config_class(SnaplogicConfig)
61
+ @support_status(SupportStatus.TESTING)
62
+ @capability(
63
+ SourceCapability.PLATFORM_INSTANCE,
64
+ "SnapLogic does not support platform instances",
65
+ supported=False,
66
+ )
67
+ @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
68
+ @capability(SourceCapability.LINEAGE_FINE, "Enabled by default")
69
+ @capability(SourceCapability.DELETION_DETECTION, "Not supported yet", supported=False)
70
+ class SnaplogicSource(StatefulIngestionSourceBase):
71
+ """
72
+ A source plugin for ingesting lineage and metadata from SnapLogic.
73
+ """
74
+
75
+ def __init__(self, config: SnaplogicConfig, ctx: PipelineContext):
76
+ super().__init__(config, ctx)
77
+ self.config = config
78
+ self.report = StaleEntityRemovalSourceReport()
79
+ self.graph: Optional[DataHubGraph] = ctx.graph
80
+ self.snaplogic_parser = SnapLogicParser(
81
+ config.case_insensitive_namespaces, self.config.namespace_mapping
82
+ )
83
+ self.redundant_lineage_run_skip_handler: Optional[
84
+ RedundantLineageRunSkipHandler
85
+ ] = None
86
+ if self.config.enable_stateful_lineage_ingestion:
87
+ self.redundant_lineage_run_skip_handler = RedundantLineageRunSkipHandler(
88
+ source=self,
89
+ config=self.config,
90
+ pipeline_name=ctx.pipeline_name,
91
+ run_id=ctx.run_id,
92
+ )
93
+ self.snaplogic_lineage_extractor = SnaplogicLineageExtractor(
94
+ config=config,
95
+ redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
96
+ report=self.report,
97
+ )
98
+
99
+ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
100
+ try:
101
+ self.report.info(
102
+ message="Starting lineage ingestion from SnapLogic",
103
+ title="Lineage Ingestion",
104
+ )
105
+
106
+ records_processed = 0
107
+ for lineage in self.snaplogic_lineage_extractor.get_lineages():
108
+ try:
109
+ for workunit in self._process_lineage_record(lineage):
110
+ yield workunit
111
+ records_processed += 1
112
+
113
+ if records_processed % 20 == 0:
114
+ self.report.info(
115
+ message=f"Processed {records_processed} lineage records",
116
+ title="Lineage Ingestion Progress",
117
+ )
118
+ except Exception as e:
119
+ self.report.report_failure(
120
+ message="Failed to process lineage record",
121
+ context=str(lineage),
122
+ exc=e,
123
+ )
124
+ self.report.info(
125
+ message=f"Completed processing {records_processed} lineage records",
126
+ title="Lineage Ingestion Complete",
127
+ )
128
+ self.snaplogic_lineage_extractor.report_status("lineage_ingestion", True)
129
+ self.snaplogic_lineage_extractor.update_stats()
130
+ except Exception as e:
131
+ self.report.report_failure(message="Failed to fetch lineages", exc=e)
132
+ self.snaplogic_lineage_extractor.report_status("lineage_ingestion", False)
133
+
134
+ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
135
+ return [
136
+ *super().get_workunit_processors(),
137
+ StaleEntityRemovalHandler.create(
138
+ self, self.config, self.ctx
139
+ ).workunit_processor,
140
+ ]
141
+
142
+ def _process_lineage_record(self, lineage: dict) -> Iterable[MetadataWorkUnit]:
143
+ """Process a lineage record to create pipeline and task workunits with relationships."""
144
+ producer = lineage.get("producer")
145
+ if not producer:
146
+ return
147
+ pipeline_snode_id = producer.split("#pipe_snode=")[1]
148
+ if not pipeline_snode_id:
149
+ return
150
+ datasets = self.snaplogic_parser.extract_datasets_from_lineage(lineage)
151
+ pipeline = self.snaplogic_parser.extract_pipeline_from_lineage(lineage)
152
+ task = self.snaplogic_parser.extract_task_from_lineage(lineage)
153
+ columns_mapping = self.snaplogic_parser.extract_columns_mapping_from_lineage(
154
+ lineage
155
+ )
156
+
157
+ # Create pipeline MCP
158
+ for pipeline_workunit in self.create_pipeline_mcp(
159
+ name=pipeline.name,
160
+ pipeline_snode_id=pipeline.id,
161
+ namespace=pipeline.namespace,
162
+ ):
163
+ self.report.report_workunit(pipeline_workunit)
164
+ yield pipeline_workunit
165
+
166
+ # Create dataset MCP
167
+ for dataset in datasets:
168
+ for dataset_workunit in self.create_dataset_mcp(
169
+ dataset_name=dataset.name,
170
+ dataset_display_name=dataset.display_name,
171
+ fields=dataset.fields,
172
+ platform=dataset.platform,
173
+ platform_instance=dataset.platform_instance,
174
+ ):
175
+ self.report.report_workunit(dataset_workunit)
176
+ yield dataset_workunit
177
+
178
+ # Create task MCP
179
+ for task_workunit in self.create_task_mcp(
180
+ name=task.name,
181
+ task_id=task.id,
182
+ namespace=task.namespace,
183
+ pipeline_snode_id=pipeline_snode_id,
184
+ input_datasets=[dataset for dataset in datasets if dataset.type == "INPUT"],
185
+ output_datasets=[
186
+ dataset for dataset in datasets if dataset.type == "OUTPUT"
187
+ ],
188
+ columns_mapping=columns_mapping,
189
+ ):
190
+ self.report.report_workunit(task_workunit)
191
+ yield task_workunit
192
+
193
+ def create_task_mcp(
194
+ self,
195
+ task_id: str,
196
+ name: str,
197
+ namespace: str,
198
+ pipeline_snode_id: str,
199
+ input_datasets: list[Dataset],
200
+ output_datasets: list[Dataset],
201
+ columns_mapping: list[ColumnMapping],
202
+ ) -> Iterable[MetadataWorkUnit]:
203
+ """Create MCPs for a task (snap) including metadata and lineage."""
204
+ job_urn = make_data_job_urn(
205
+ orchestrator=namespace,
206
+ flow_id=pipeline_snode_id,
207
+ job_id=task_id,
208
+ cluster="PROD",
209
+ )
210
+ yield MetadataChangeProposalWrapper(
211
+ entityUrn=job_urn,
212
+ aspect=DataJobInfoClass(
213
+ name=name,
214
+ description="",
215
+ externalUrl=f"{self.config.base_url}/sl/designer.html?v=21818#pipe_snode={pipeline_snode_id}",
216
+ type="SNAPLOGIC_SNAP",
217
+ ),
218
+ ).as_workunit()
219
+
220
+ # Helper functions
221
+ def dataset_urn(d: Dataset) -> str:
222
+ return make_dataset_urn_with_platform_instance(
223
+ d.platform, d.name, d.platform_instance
224
+ )
225
+
226
+ def field_urn(d, f):
227
+ return make_schema_field_urn(dataset_urn(d), f["name"])
228
+
229
+ # Emit lineage
230
+ yield MetadataChangeProposalWrapper(
231
+ entityUrn=job_urn,
232
+ aspect=DataJobInputOutputClass(
233
+ inputDatasets=[dataset_urn(d) for d in input_datasets],
234
+ outputDatasets=[dataset_urn(d) for d in output_datasets],
235
+ inputDatasetFields=[
236
+ field_urn(d, f) for d in input_datasets for f in d.fields
237
+ ],
238
+ outputDatasetFields=[
239
+ field_urn(d, f) for d in output_datasets for f in d.fields
240
+ ],
241
+ fineGrainedLineages=[
242
+ FineGrainedLineageClass(
243
+ upstreamType=FineGrainedLineageDownstreamTypeClass.FIELD_SET,
244
+ upstreams=[
245
+ make_schema_field_urn(
246
+ make_dataset_urn_with_platform_instance(
247
+ cl.input_dataset.platform,
248
+ cl.input_dataset.name,
249
+ cl.input_dataset.platform_instance,
250
+ cl.input_dataset.env,
251
+ ),
252
+ cl.input_field,
253
+ )
254
+ ],
255
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD_SET,
256
+ downstreams=[
257
+ make_schema_field_urn(
258
+ make_dataset_urn_with_platform_instance(
259
+ cl.output_dataset.platform,
260
+ cl.output_dataset.name,
261
+ cl.output_dataset.platform_instance,
262
+ cl.output_dataset.env,
263
+ ),
264
+ cl.output_field,
265
+ )
266
+ ],
267
+ )
268
+ for cl in columns_mapping
269
+ ],
270
+ ),
271
+ ).as_workunit()
272
+
273
+ def create_dataset_mcp(
274
+ self,
275
+ dataset_name: str,
276
+ dataset_display_name: str,
277
+ fields: list[dict],
278
+ platform: str = "snaplogic",
279
+ env: str = "PROD",
280
+ platform_instance: Optional[str] = None,
281
+ ) -> Iterable[MetadataWorkUnit]:
282
+ dataset_urn = make_dataset_urn_with_platform_instance(
283
+ platform=platform,
284
+ name=dataset_name,
285
+ env=env,
286
+ platform_instance=platform_instance,
287
+ )
288
+
289
+ # Skip dataset creation if:
290
+ # 1. The platform is not "snaplogic" AND
291
+ # 2. Either:
292
+ # a) The config `create_non_snaplogic_datasets` is disabled (False), meaning
293
+ # we do not create datasets for non-snaplogic platforms, OR
294
+ # b) The dataset already exists in DataHub (`self.graph.exists(dataset_urn)`).
295
+ if platform != "snaplogic" and (
296
+ not self.config.create_non_snaplogic_datasets
297
+ or (self.graph and self.graph.exists(dataset_urn))
298
+ ):
299
+ return
300
+
301
+ dataset_properties = DatasetPropertiesClass(
302
+ name=dataset_display_name,
303
+ qualifiedName=dataset_name,
304
+ )
305
+ schema_fields = [
306
+ SchemaFieldClass(
307
+ fieldPath=field["name"],
308
+ type=SnaplogicUtils.get_datahub_type(field.get("type", "Varchar")),
309
+ nativeDataType=field.get("type", "Varchar"),
310
+ )
311
+ for field in fields
312
+ ]
313
+ schema_metadata = SchemaMetadataClass(
314
+ schemaName=dataset_name,
315
+ platform=make_data_platform_urn(platform),
316
+ version=0,
317
+ hash="",
318
+ platformSchema=OtherSchemaClass(rawSchema=""),
319
+ fields=schema_fields,
320
+ )
321
+
322
+ yield MetadataChangeProposalWrapper(
323
+ entityUrn=dataset_urn, aspect=dataset_properties
324
+ ).as_workunit()
325
+
326
+ yield MetadataChangeProposalWrapper(
327
+ entityUrn=dataset_urn, aspect=schema_metadata
328
+ ).as_workunit()
329
+
330
+ def create_pipeline_mcp(
331
+ self, name: str, namespace: str, pipeline_snode_id: str
332
+ ) -> Iterable[MetadataWorkUnit]:
333
+ flow_urn = make_data_flow_urn(
334
+ orchestrator=namespace, flow_id=pipeline_snode_id, cluster="PROD"
335
+ )
336
+
337
+ yield MetadataChangeProposalWrapper(
338
+ entityUrn=flow_urn,
339
+ aspect=DataFlowInfoClass(
340
+ name=name,
341
+ description="",
342
+ externalUrl=f"{self.config.base_url}/sl/designer.html?v=21818#pipe_snode={pipeline_snode_id}",
343
+ ),
344
+ ).as_workunit()
345
+
346
+ def get_report(self) -> SourceReport:
347
+ return self.report
348
+
349
    def close(self) -> None:
        # Delegate cleanup to the base class; no source-specific resources are
        # released here.
        super().close()
351
+
352
+ @classmethod
353
+ def create(cls, config_dict: dict, ctx: PipelineContext) -> "SnaplogicSource":
354
+ config = SnaplogicConfig.parse_obj(config_dict)
355
+ return cls(config, ctx)
@@ -0,0 +1,37 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import Field, SecretStr
4
+
5
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
6
+ StatefulIngestionConfigBase,
7
+ StatefulStaleMetadataRemovalConfig,
8
+ )
9
+ from datahub.ingestion.source.state.stateful_ingestion_base import (
10
+ StatefulLineageConfigMixin,
11
+ StatefulUsageConfigMixin,
12
+ )
13
+
14
+
15
class SnaplogicConfig(
    StatefulIngestionConfigBase, StatefulLineageConfigMixin, StatefulUsageConfigMixin
):
    """Configuration for the SnapLogic ingestion source.

    Inherits the stateful-ingestion, lineage and usage mixins so the source
    supports stale-entity removal and redundant-run skipping.
    """

    platform: str = "SnapLogic"
    # Basic-auth credentials for the SnapLogic public API.
    username: str = Field(description="Username")
    password: SecretStr = Field(description="Password")
    base_url: str = Field(
        default="https://elastic.snaplogic.com",
        description="Url to your SnapLogic instance: `https://elastic.snaplogic.com`, or similar. Used for making API calls to SnapLogic.",
    )
    org_name: str = Field(description="Organization name from SnapLogic instance")
    # Maps an OpenLineage namespace to a DataHub platform instance name.
    namespace_mapping: dict = Field(
        default={}, description="Mapping of namespaces to platform instances"
    )
    # Dataset/field names from these namespaces are lowercased during parsing.
    case_insensitive_namespaces: list = Field(
        default=[],
        description="List of namespaces that should be treated as case insensitive",
    )
    # When False, datasets on foreign platforms (databases, S3, ...) are only
    # referenced by lineage, never created.
    create_non_snaplogic_datasets: bool = Field(
        default=False,
        description="Whether to create datasets for non-SnapLogic datasets (e.g., databases, S3, etc.)",
    )
    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
@@ -0,0 +1,107 @@
1
+ from datetime import datetime
2
+ from typing import Iterable, Optional, Tuple
3
+
4
+ import requests
5
+
6
+ from datahub.ingestion.api.source import (
7
+ SourceReport,
8
+ )
9
+ from datahub.ingestion.source.snaplogic.snaplogic_config import SnaplogicConfig
10
+ from datahub.ingestion.source.state.redundant_run_skip_handler import (
11
+ RedundantLineageRunSkipHandler,
12
+ )
13
+
14
+
15
+ class SnaplogicLineageExtractor:
16
+ """
17
+ A class to interact with the SnapLogic API.
18
+ """
19
+
20
+ def __init__(
21
+ self,
22
+ config: SnaplogicConfig,
23
+ redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler],
24
+ report: SourceReport,
25
+ ):
26
+ self.config = config
27
+ self.report = report
28
+ self.redundant_run_skip_handler = redundant_run_skip_handler
29
+ self.start_time, self.end_time = self._get_time_window()
30
+
31
+ def get_lineages(self) -> Iterable[dict]:
32
+ """Generator function that yields lineage records one at a time as they are fetched."""
33
+ page = 0
34
+ has_more = True
35
+ records_processed = 0
36
+
37
+ try:
38
+ while has_more:
39
+ params = {
40
+ "format": "OPENLINEAGE",
41
+ "start_ts": str(int(self.start_time.timestamp() * 1000)),
42
+ "end_ts": str(int(self.end_time.timestamp() * 1000)),
43
+ "page": str(page),
44
+ }
45
+
46
+ self.report.info(
47
+ message=f"Fetching lineage data - page: {page}, start_ts: {self.start_time}, end_ts: {self.end_time}",
48
+ title="Lineage Fetch",
49
+ )
50
+ headers = {"User-Agent": "datahub-connector/1.0"}
51
+ response = requests.get(
52
+ url=f"{self.config.base_url}/api/1/rest/public/catalog/{self.config.org_name}/lineage",
53
+ params=params,
54
+ headers=headers,
55
+ auth=(
56
+ self.config.username,
57
+ self.config.password.get_secret_value(),
58
+ ),
59
+ )
60
+ response.raise_for_status()
61
+
62
+ data = response.json()
63
+ content = data["content"]
64
+
65
+ # Yield records one at a time
66
+ for record in content:
67
+ records_processed += 1
68
+ yield record
69
+
70
+ # Check if we need to fetch more pages
71
+ has_more = (
72
+ len(content) >= 20
73
+ ) # If we got full page size, there might be more
74
+ page += 1
75
+
76
+ self.report.info(
77
+ message=f"Completed fetching lineage data. Total records processed: {records_processed}",
78
+ title="Lineage Fetch Complete",
79
+ )
80
+
81
+ except Exception as e:
82
+ self.report.report_failure(
83
+ message="Error fetching lineage data",
84
+ exc=e,
85
+ title="Lineage Fetch Error",
86
+ )
87
+ raise
88
+
89
+ def _get_time_window(self) -> Tuple[datetime, datetime]:
90
+ if self.redundant_run_skip_handler:
91
+ return self.redundant_run_skip_handler.suggest_run_time_window(
92
+ self.config.start_time, self.config.end_time
93
+ )
94
+ else:
95
+ return self.config.start_time, self.config.end_time
96
+
97
+ def update_stats(self):
98
+ if self.redundant_run_skip_handler:
99
+ # Update the checkpoint state for this run.
100
+ self.redundant_run_skip_handler.update_state(
101
+ self.config.start_time,
102
+ self.config.end_time,
103
+ )
104
+
105
+ def report_status(self, step: str, status: bool) -> None:
106
+ if self.redundant_run_skip_handler:
107
+ self.redundant_run_skip_handler.report_current_run_status(step, status)
@@ -0,0 +1,168 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Dict, List, Optional
3
+
4
+
5
@dataclass
class Dataset:
    """A dataset referenced by a lineage record (either side of an edge)."""

    name: str
    display_name: str
    fields: List[Dict] = field(default_factory=list)
    platform: str = "snaplogic"
    platform_instance: Optional[str] = None
    type: Optional[str] = None  # INPUT or OUTPUT
    env: str = "PROD"


@dataclass
class Pipeline:
    """A SnapLogic pipeline (rendered as a DataHub DataFlow)."""

    name: str
    id: str
    namespace: str


@dataclass
class Task:
    """A SnapLogic snap (rendered as a DataHub DataJob)."""

    name: str
    id: str
    namespace: str


@dataclass
class ColumnMapping:
    """One column-level lineage edge between an input and an output dataset."""

    input_dataset: Dataset
    output_dataset: Dataset
    input_field: str
    output_field: str


class SnapLogicParser:
    """Translates SnapLogic OpenLineage payloads into the model classes above."""

    def __init__(self, case_insensitive_namespaces: list[str], namespace_mapping: dict):
        self.case_insensitive_namespaces = case_insensitive_namespaces
        self.namespace_mapping = namespace_mapping
        # OpenLineage namespace scheme -> DataHub platform name overrides.
        self.platform_mapping = {
            "sqlserver": "mssql",
        }

    def _parse_platform(self, namespace: str) -> str:
        """Derive the DataHub platform name from an OpenLineage namespace.

        The scheme part of a `scheme://host` namespace (or the whole string
        when no scheme is present) is lowercased and mapped through
        `platform_mapping`.
        """
        type_part = namespace.split("://")[0] if "://" in namespace else namespace

        return self.platform_mapping.get(type_part.lower(), type_part.lower())

    def extract_task_from_lineage(self, lineage: dict) -> Task:
        """Build a Task from the record's ``job`` section.

        Raises:
            ValueError: if the record has no ``job`` section.
        """
        job = lineage.get("job")
        if not job:
            raise ValueError("Job information is missing in the lineage data.")
        name = job.get("name")
        namespace = job.get("namespace")

        return Task(
            id=name,
            # Job names carry a ":<suffix>"; keep only the human-readable part.
            name=name.rsplit(":", 1)[0],
            namespace=self._parse_platform(namespace),
        )

    def extract_pipeline_from_lineage(self, lineage: dict) -> Pipeline:
        """Build a Pipeline from the parent-run facet of the record.

        NOTE(review): assumes ``_producer`` is present and contains a
        ``#pipe_snode=`` fragment; a record without either raises
        (AttributeError/IndexError) here — confirm against the API contract.
        """
        parent_run = lineage.get("run", {}).get("facets", {}).get("parent", {})
        job = parent_run.get("job", {})
        name = job.get("name")
        namespace = job.get("namespace")
        pipeline_snode_id = parent_run.get("_producer").split("#pipe_snode=")[1]
        return Pipeline(
            id=pipeline_snode_id, name=name, namespace=self._parse_platform(namespace)
        )

    def _get_case_sensitive_value(self, value: str, namespace: str) -> str:
        """Transform value to lowercase if namespace is case-insensitive."""
        return value.lower() if namespace in self.case_insensitive_namespaces else value

    def _create_dataset_info(
        self,
        namespace: str,
        name: str,
        display_name: str,
        type: str,
        fields: Optional[List[Dict]] = None,
    ) -> Dataset:
        """Create a Dataset instance with proper case sensitivity."""
        return Dataset(
            platform=self._parse_platform(namespace),
            name=self._get_case_sensitive_value(name, namespace),
            display_name=display_name or name,
            fields=fields or [],
            env="PROD",
            platform_instance=self.namespace_mapping.get(namespace, None),
            type=type,
        )

    def extract_columns_mapping_from_lineage(
        self, lineage: dict
    ) -> List[ColumnMapping]:
        """Flatten the columnLineage facet of each output into ColumnMapping edges."""
        outputs = lineage.get("outputs", [])
        lineages = []

        for output in outputs:
            output_namespace = output.get("namespace")
            output_name = output.get("name", "")
            column_lineage = (
                output.get("facets", {}).get("columnLineage", {}).get("fields", {})
            )

            for field_name, field_dict in column_lineage.items():
                output_field = self._get_case_sensitive_value(
                    field_name, output_namespace
                )

                # One edge per contributing input field.
                for input_field in field_dict.get("inputFields", []):
                    input_namespace = input_field.get("namespace")
                    input_name = input_field.get("name", "")
                    input_field_name = input_field.get("field", "")

                    lineages.append(
                        ColumnMapping(
                            input_dataset=self._create_dataset_info(
                                input_namespace, input_name, input_name, "INPUT"
                            ),
                            output_dataset=self._create_dataset_info(
                                output_namespace, output_name, output_name, "OUTPUT"
                            ),
                            input_field=self._get_case_sensitive_value(
                                input_field_name, input_namespace
                            ),
                            output_field=output_field,
                        )
                    )

        return lineages

    def extract_datasets_from_lineage(self, lineage: dict) -> List[Dataset]:
        """Collect every input/output dataset of the record, tagged with its side."""
        # FIX: defaults were `{}` (a dict) where a list is iterated; identical
        # for the empty case, but `[]` matches the payload's actual type.
        inputs = lineage.get("inputs", [])
        outputs = lineage.get("outputs", [])

        datasets = []
        for dataset, dataset_type in [
            *[(input_dataset, "INPUT") for input_dataset in inputs],
            *[(output_dataset, "OUTPUT") for output_dataset in outputs],
        ]:
            namespace = dataset.get("namespace")
            name = dataset.get("name", "")
            fields = dataset.get("facets", {}).get("schema", {}).get("fields", [])
            # Keep the original casing for display, even when the name below
            # gets lowercased.
            display_name = name

            # Transform names to lowercase if namespace is in case_insensitive_namespaces
            if namespace in self.case_insensitive_namespaces:
                name = name.lower()
                fields = [
                    {**field, "name": field.get("name", "").lower()} for field in fields
                ]

            datasets.append(
                self._create_dataset_info(
                    namespace=namespace,
                    name=name,
                    fields=fields,
                    display_name=display_name,
                    type=dataset_type,
                )
            )

        return datasets
@@ -0,0 +1,31 @@
1
+ from datahub.metadata.schema_classes import (
2
+ BooleanTypeClass,
3
+ NumberTypeClass,
4
+ SchemaFieldDataTypeClass,
5
+ StringTypeClass,
6
+ )
7
+
8
+
9
class SnaplogicUtils:
    @staticmethod
    def get_datahub_type(type_str: str) -> SchemaFieldDataTypeClass:
        """
        Map a string-based type name to a DataHub SchemaFieldDataTypeClass.

        Args:
            type_str (str): The input type (e.g., "string", "int", "boolean").

        Returns:
            SchemaFieldDataTypeClass: The mapped DataHub type; anything
            unrecognized falls back to string.
        """
        lowered = type_str.lower()

        if lowered == "boolean":
            return SchemaFieldDataTypeClass(type=BooleanTypeClass())
        if lowered in ("number", "long", "float", "double", "int"):
            return SchemaFieldDataTypeClass(type=NumberTypeClass())
        # "string", "varchar" and every unknown type map to string.
        return SchemaFieldDataTypeClass(type=StringTypeClass())