acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub has been flagged as potentially problematic; review the release details before upgrading.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
+import pathlib
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Union

 import feast.types
 from feast import (
@@ -97,7 +98,7 @@ class FeastRepositorySourceConfig(
     StatefulIngestionConfigBase,
 ):
     path: str = Field(description="Path to Feast repository")
-    fs_yaml_file: Optional[str] = Field(
+    fs_yaml_file: Optional[pathlib.Path] = Field(
         default=None,
         description="Path to the `feature_store.yaml` file used to configure the feature store",
     )
@@ -135,24 +136,21 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
135
136
  """
136
137
  This plugin extracts:
137
138
 
138
- - Entities as [`MLPrimaryKey`](https://datahubproject.io/docs/graphql/objects#mlprimarykey)
139
- - Fields as [`MLFeature`](https://datahubproject.io/docs/graphql/objects#mlfeature)
140
- - Feature views and on-demand feature views as [`MLFeatureTable`](https://datahubproject.io/docs/graphql/objects#mlfeaturetable)
141
- - Batch and stream source details as [`Dataset`](https://datahubproject.io/docs/graphql/objects#dataset)
139
+ - Entities as [`MLPrimaryKey`](https://docs.datahub.com/docs/graphql/objects#mlprimarykey)
140
+ - Fields as [`MLFeature`](https://docs.datahub.com/docs/graphql/objects#mlfeature)
141
+ - Feature views and on-demand feature views as [`MLFeatureTable`](https://docs.datahub.com/docs/graphql/objects#mlfeaturetable)
142
+ - Batch and stream source details as [`Dataset`](https://docs.datahub.com/docs/graphql/objects#dataset)
142
143
  - Column types associated with each entity and feature
143
144
  """
144
145
 
145
- platform = "feast"
146
- source_config: FeastRepositorySourceConfig
147
- report: StaleEntityRemovalSourceReport
148
- feature_store: FeatureStore
146
+ platform: ClassVar[str] = "feast"
149
147
 
150
148
  def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
151
149
  super().__init__(config, ctx)
152
- self.source_config = config
153
- self.ctx = ctx
154
- self.report = StaleEntityRemovalSourceReport()
155
- self.feature_store = FeatureStore(
150
+ self.source_config: FeastRepositorySourceConfig = config
151
+ self.ctx: PipelineContext = ctx
152
+ self.report: StaleEntityRemovalSourceReport = StaleEntityRemovalSourceReport()
153
+ self.feature_store: FeatureStore = FeatureStore(
156
154
  repo_path=self.source_config.path,
157
155
  fs_yaml_file=self.source_config.fs_yaml_file,
158
156
  )
@@ -18,7 +18,9 @@ from datahub.configuration.validate_field_rename import pydantic_renamed_field
18
18
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
19
19
  from datahub.ingestion.api.common import PipelineContext
20
20
  from datahub.ingestion.api.decorators import (
21
+ SourceCapability,
21
22
  SupportStatus,
23
+ capability,
22
24
  config_class,
23
25
  platform_name,
24
26
  support_status,
@@ -187,6 +189,7 @@ class FileSourceReport(StaleEntityRemovalSourceReport):
187
189
  @platform_name("Metadata File")
188
190
  @config_class(FileSourceConfig)
189
191
  @support_status(SupportStatus.CERTIFIED)
192
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
190
193
  class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
191
194
  """
192
195
  This plugin pulls metadata from a previously generated file.
@@ -16,7 +16,7 @@ from datahub.configuration.source_common import DatasetSourceConfigMixin
16
16
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
17
17
  from datahub.emitter.mce_builder import DEFAULT_ENV
18
18
  from datahub.ingestion.api.report import Report
19
- from datahub.ingestion.source.bigquery_v2.bigquery_config import (
19
+ from datahub.ingestion.source.bigquery_v2.bigquery_connection import (
20
20
  BigQueryConnectionConfig,
21
21
  )
22
22
  from datahub.ingestion.source.snowflake.snowflake_connection import (
@@ -29,6 +29,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
29
29
  from datahub.ingestion.source.state.stateful_ingestion_base import (
30
30
  StatefulIngestionConfigBase,
31
31
  )
32
+ from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig
32
33
  from datahub.utilities.lossy_collections import LossyList
33
34
  from datahub.utilities.perf_timer import PerfTimer
34
35
 
@@ -56,8 +57,8 @@ class Constant:
56
57
  STATUS = "status"
57
58
  USER_ID = "user_id"
58
59
  EMAIL = "email"
59
- CONNECTOR_ID = "connector_id"
60
- CONNECTOR_NAME = "connector_name"
60
+ CONNECTOR_ID = "connection_id"
61
+ CONNECTOR_NAME = "connection_name"
61
62
  CONNECTOR_TYPE_ID = "connector_type_id"
62
63
  PAUSED = "paused"
63
64
  SYNC_FREQUENCY = "sync_frequency"
@@ -67,13 +68,22 @@ class Constant:
67
68
  SUCCESSFUL = "SUCCESSFUL"
68
69
  FAILURE_WITH_TASK = "FAILURE_WITH_TASK"
69
70
  CANCELED = "CANCELED"
71
+ GOOGLE_SHEETS_CONNECTOR_TYPE = "google_sheets"
70
72
 
71
73
 
74
+ # Key: Connector Type, Value: Platform ID/Name
72
75
  KNOWN_DATA_PLATFORM_MAPPING = {
76
+ "google_cloud_postgresql": "postgres",
73
77
  "postgres": "postgres",
74
78
  "snowflake": "snowflake",
79
+ Constant.GOOGLE_SHEETS_CONNECTOR_TYPE: Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
75
80
  }
76
81
 
82
+ # Note: (As of Oct 2025) Fivetran Platform Connector has stale lineage metadata for Google Sheets column data (deleted/renamed).
83
+ # Ref: https://fivetran.com/docs/connectors/files/google-sheets#deletingdata
84
+ # TODO: Remove Google Sheets connector type from DISABLE_LINEAGE_FOR_CONNECTOR_TYPES
85
+ DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES = [Constant.GOOGLE_SHEETS_CONNECTOR_TYPE]
86
+
77
87
 
78
88
  class SnowflakeDestinationConfig(SnowflakeConnectionConfig):
79
89
  database: str = Field(description="The fivetran connector log database.")
@@ -84,10 +94,34 @@ class BigQueryDestinationConfig(BigQueryConnectionConfig):
84
94
  dataset: str = Field(description="The fivetran connector log dataset.")
85
95
 
86
96
 
97
+ class DatabricksDestinationConfig(UnityCatalogConnectionConfig):
98
+ catalog: str = Field(description="The fivetran connector log catalog.")
99
+ log_schema: str = Field(description="The fivetran connector log schema.")
100
+
101
+ @pydantic.validator("warehouse_id")
102
+ def warehouse_id_should_not_be_empty(cls, warehouse_id: Optional[str]) -> str:
103
+ if warehouse_id is None or (warehouse_id and warehouse_id.strip() == ""):
104
+ raise ValueError("Fivetran requires warehouse_id to be set")
105
+ return warehouse_id
106
+
107
+
108
+ class FivetranAPIConfig(ConfigModel):
109
+ api_key: str = Field(description="Fivetran API key")
110
+ api_secret: str = Field(description="Fivetran API secret")
111
+ base_url: str = Field(
112
+ default="https://api.fivetran.com", description="Fivetran API base URL"
113
+ )
114
+ request_timeout_sec: int = Field(
115
+ default=30, description="Request timeout in seconds"
116
+ )
117
+
118
+
87
119
  class FivetranLogConfig(ConfigModel):
88
- destination_platform: Literal["snowflake", "bigquery"] = pydantic.Field(
89
- default="snowflake",
90
- description="The destination platform where fivetran connector log tables are dumped.",
120
+ destination_platform: Literal["snowflake", "bigquery", "databricks"] = (
121
+ pydantic.Field(
122
+ default="snowflake",
123
+ description="The destination platform where fivetran connector log tables are dumped.",
124
+ )
91
125
  )
92
126
  snowflake_destination_config: Optional[SnowflakeDestinationConfig] = pydantic.Field(
93
127
  default=None,
@@ -97,11 +131,17 @@ class FivetranLogConfig(ConfigModel):
97
131
  default=None,
98
132
  description="If destination platform is 'bigquery', provide bigquery configuration.",
99
133
  )
134
+ databricks_destination_config: Optional[DatabricksDestinationConfig] = (
135
+ pydantic.Field(
136
+ default=None,
137
+ description="If destination platform is 'databricks', provide databricks configuration.",
138
+ )
139
+ )
100
140
  _rename_destination_config = pydantic_renamed_field(
101
141
  "destination_config", "snowflake_destination_config"
102
142
  )
103
143
 
104
- @root_validator(pre=True)
144
+ @root_validator(skip_on_failure=True)
105
145
  def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict:
106
146
  destination_platform = values["destination_platform"]
107
147
  if destination_platform == "snowflake":
@@ -114,6 +154,11 @@ class FivetranLogConfig(ConfigModel):
114
154
  raise ValueError(
115
155
  "If destination platform is 'bigquery', user must provide bigquery destination configuration in the recipe."
116
156
  )
157
+ elif destination_platform == "databricks":
158
+ if "databricks_destination_config" not in values:
159
+ raise ValueError(
160
+ "If destination platform is 'databricks', user must provide databricks destination configuration in the recipe."
161
+ )
117
162
  else:
118
163
  raise ValueError(
119
164
  f"Destination platform '{destination_platform}' is not yet supported."
@@ -137,6 +182,7 @@ class MetadataExtractionPerfReport(Report):
137
182
  @dataclasses.dataclass
138
183
  class FivetranSourceReport(StaleEntityRemovalSourceReport):
139
184
  connectors_scanned: int = 0
185
+ fivetran_rest_api_call_count: int = 0
140
186
  filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList)
141
187
  metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field(
142
188
  default_factory=MetadataExtractionPerfReport
@@ -148,6 +194,9 @@ class FivetranSourceReport(StaleEntityRemovalSourceReport):
148
194
  def report_connectors_dropped(self, connector: str) -> None:
149
195
  self.filtered_connectors.append(connector)
150
196
 
197
+ def report_fivetran_rest_api_call_count(self) -> None:
198
+ self.fivetran_rest_api_call_count += 1
199
+
151
200
 
152
201
  class PlatformDetail(ConfigModel):
153
202
  platform: Optional[str] = pydantic.Field(
@@ -194,7 +243,7 @@ class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin
194
243
 
195
244
  # Configuration for stateful ingestion
196
245
  stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field(
197
- default=None, description="Airbyte Stateful Ingestion Config."
246
+ default=None, description="Fivetran Stateful Ingestion Config."
198
247
  )
199
248
 
200
249
  # Fivetran connector all sources to platform instance mapping
@@ -208,6 +257,16 @@ class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin
208
257
  description="A mapping of destination id to its platform/instance/env details.",
209
258
  )
210
259
 
260
+ """
261
+ Use Fivetran REST API to get :
262
+ - Google Sheets Connector details and emit related entities
263
+ Fivetran Platform Connector syncs limited information about the Google Sheets Connector.
264
+ """
265
+ api_config: Optional[FivetranAPIConfig] = Field(
266
+ default=None,
267
+ description="Fivetran REST API configuration, used to provide wider support for connections.",
268
+ )
269
+
211
270
  @pydantic.root_validator(pre=True)
212
271
  def compat_sources_to_database(cls, values: Dict) -> Dict:
213
272
  if "sources_to_database" in values:
@@ -1,8 +1,9 @@
1
1
  import logging
2
- from typing import Dict, Iterable, List, Optional
2
+ from typing import Dict, Iterable, List, Optional, Union
3
+ from urllib.parse import urlparse
3
4
 
4
5
  import datahub.emitter.mce_builder as builder
5
- from datahub.api.entities.datajob import DataFlow, DataJob
6
+ from datahub.api.entities.datajob import DataJob as DataJobV1
6
7
  from datahub.api.entities.dataprocess.dataprocess_instance import (
7
8
  DataProcessInstance,
8
9
  InstanceRunResult,
@@ -16,8 +17,13 @@ from datahub.ingestion.api.decorators import (
16
17
  platform_name,
17
18
  support_status,
18
19
  )
19
- from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
20
+ from datahub.ingestion.api.source import (
21
+ MetadataWorkUnitProcessor,
22
+ SourceReport,
23
+ StructuredLogCategory,
24
+ )
20
25
  from datahub.ingestion.api.workunit import MetadataWorkUnit
26
+ from datahub.ingestion.source.common.subtypes import DatasetSubTypes
21
27
  from datahub.ingestion.source.fivetran.config import (
22
28
  KNOWN_DATA_PLATFORM_MAPPING,
23
29
  Constant,
@@ -31,27 +37,39 @@ from datahub.ingestion.source.fivetran.fivetran_query import (
31
37
  MAX_JOBS_PER_CONNECTOR,
32
38
  MAX_TABLE_LINEAGE_PER_CONNECTOR,
33
39
  )
40
+ from datahub.ingestion.source.fivetran.fivetran_rest_api import FivetranAPIClient
41
+ from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
34
42
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
35
43
  StaleEntityRemovalHandler,
36
44
  )
37
45
  from datahub.ingestion.source.state.stateful_ingestion_base import (
38
46
  StatefulIngestionSourceBase,
39
47
  )
48
+ from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
40
49
  from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
41
50
  FineGrainedLineage,
42
51
  FineGrainedLineageDownstreamType,
43
52
  FineGrainedLineageUpstreamType,
53
+ UpstreamLineage,
54
+ )
55
+ from datahub.metadata.schema_classes import (
56
+ DatasetLineageTypeClass,
57
+ UpstreamClass,
44
58
  )
45
- from datahub.utilities.urns.data_flow_urn import DataFlowUrn
46
- from datahub.utilities.urns.dataset_urn import DatasetUrn
59
+ from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
60
+ from datahub.sdk.dataflow import DataFlow
61
+ from datahub.sdk.datajob import DataJob
62
+ from datahub.sdk.dataset import Dataset
63
+ from datahub.sdk.entity import Entity
47
64
 
48
65
  # Logger instance
49
66
  logger = logging.getLogger(__name__)
67
+ CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
50
68
 
51
69
 
52
70
  @platform_name("Fivetran")
53
71
  @config_class(FivetranSourceConfig)
54
- @support_status(SupportStatus.INCUBATING)
72
+ @support_status(SupportStatus.CERTIFIED)
55
73
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
56
74
  @capability(
57
75
  SourceCapability.LINEAGE_FINE,
@@ -60,7 +78,6 @@ logger = logging.getLogger(__name__)
60
78
  class FivetranSource(StatefulIngestionSourceBase):
61
79
  """
62
80
  This plugin extracts fivetran users, connectors, destinations and sync history.
63
- This plugin is in beta and has only been tested on Snowflake connector.
64
81
  """
65
82
 
66
83
  config: FivetranSourceConfig
@@ -71,12 +88,16 @@ class FivetranSource(StatefulIngestionSourceBase):
71
88
  super().__init__(config, ctx)
72
89
  self.config = config
73
90
  self.report = FivetranSourceReport()
74
-
75
91
  self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
92
+ self.api_client: Optional[FivetranAPIClient] = None
93
+ self._connection_details_cache: Dict[str, FivetranConnectionDetails] = {}
94
+
95
+ if self.config.api_config:
96
+ self.api_client = FivetranAPIClient(self.config.api_config)
76
97
 
77
98
  def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
78
- input_dataset_urn_list: List[DatasetUrn] = []
79
- output_dataset_urn_list: List[DatasetUrn] = []
99
+ input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
100
+ output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
80
101
  fine_grained_lineage: List[FineGrainedLineage] = []
81
102
 
82
103
  # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -94,8 +115,10 @@ class FivetranSource(StatefulIngestionSourceBase):
94
115
  self.report.info(
95
116
  title="Guessing source platform for lineage",
96
117
  message="We encountered a connector type that we don't fully support yet. "
97
- "We will attempt to guess the platform based on the connector type.",
98
- context=f"{connector.connector_name} (connector_id: {connector.connector_id}, connector_type: {connector.connector_type})",
118
+ "We will attempt to guess the platform based on the connector type. "
119
+ "Note that we use connector_id as the key not connector_name which you may see in the UI of Fivetran. ",
120
+ context=f"connector_name: {connector.connector_name} (connector_id: {connector.connector_id}, connector_type: {connector.connector_type})",
121
+ log_category=StructuredLogCategory.LINEAGE,
99
122
  )
100
123
  source_details.platform = connector.connector_type
101
124
 
@@ -124,17 +147,43 @@ class FivetranSource(StatefulIngestionSourceBase):
124
147
  if source_details.include_schema_in_urn
125
148
  else lineage.source_table.split(".", 1)[1]
126
149
  )
127
- input_dataset_urn = DatasetUrn.create_from_ids(
128
- platform_id=source_details.platform,
129
- table_name=(
130
- f"{source_details.database.lower()}.{source_table}"
131
- if source_details.database
132
- else source_table
133
- ),
134
- env=source_details.env,
135
- platform_instance=source_details.platform_instance,
136
- )
137
- input_dataset_urn_list.append(input_dataset_urn)
150
+ input_dataset_urn: Optional[DatasetUrn] = None
151
+ # Special Handling for Google Sheets Connectors
152
+ if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
153
+ # Get Google Sheet dataset details from Fivetran API
154
+ # This is cached in the api_client
155
+ gsheets_conn_details: Optional[FivetranConnectionDetails] = (
156
+ self._get_connection_details_by_id(connector.connector_id)
157
+ )
158
+
159
+ if gsheets_conn_details:
160
+ input_dataset_urn = DatasetUrn.create_from_ids(
161
+ platform_id=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
162
+ table_name=self._get_gsheet_named_range_dataset_id(
163
+ gsheets_conn_details
164
+ ),
165
+ env=source_details.env,
166
+ )
167
+ else:
168
+ self.report.warning(
169
+ title="Failed to extract lineage for Google Sheets Connector",
170
+ message="Unable to extract lineage for Google Sheets Connector, as the connector details are not available from Fivetran API.",
171
+ context=f"{connector.connector_name} (connector_id: {connector.connector_id})",
172
+ )
173
+ else:
174
+ input_dataset_urn = DatasetUrn.create_from_ids(
175
+ platform_id=source_details.platform,
176
+ table_name=(
177
+ f"{source_details.database.lower()}.{source_table}"
178
+ if source_details.database
179
+ else source_table
180
+ ),
181
+ env=source_details.env,
182
+ platform_instance=source_details.platform_instance,
183
+ )
184
+
185
+ if input_dataset_urn:
186
+ input_dataset_urn_list.append(input_dataset_urn)
138
187
 
139
188
  destination_table = (
140
189
  lineage.destination_table
@@ -178,9 +227,9 @@ class FivetranSource(StatefulIngestionSourceBase):
178
227
  )
179
228
  )
180
229
 
181
- datajob.inlets.extend(input_dataset_urn_list)
182
- datajob.outlets.extend(output_dataset_urn_list)
183
- datajob.fine_grained_lineages.extend(fine_grained_lineage)
230
+ datajob.set_inlets(input_dataset_urn_list)
231
+ datajob.set_outlets(output_dataset_urn_list)
232
+ datajob.set_fine_grained_lineages(fine_grained_lineage)
184
233
 
185
234
  return dict(
186
235
  **{
@@ -197,10 +246,10 @@ class FivetranSource(StatefulIngestionSourceBase):
197
246
 
198
247
  def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
199
248
  return DataFlow(
200
- orchestrator=Constant.ORCHESTRATOR,
201
- id=connector.connector_id,
249
+ platform=Constant.ORCHESTRATOR,
250
+ name=connector.connector_id,
202
251
  env=self.config.env,
203
- name=connector.connector_name,
252
+ display_name=connector.connector_name,
204
253
  platform_instance=self.config.platform_instance,
205
254
  )
206
255
 
@@ -213,10 +262,11 @@ class FivetranSource(StatefulIngestionSourceBase):
213
262
  )
214
263
  owner_email = self.audit_log.get_user_email(connector.user_id)
215
264
  datajob = DataJob(
216
- id=connector.connector_id,
265
+ name=connector.connector_id,
217
266
  flow_urn=dataflow_urn,
218
- name=connector.connector_name,
219
- owners={owner_email} if owner_email else set(),
267
+ platform_instance=self.config.platform_instance,
268
+ display_name=connector.connector_name,
269
+ owners=[CorpUserUrn(owner_email)] if owner_email else None,
220
270
  )
221
271
 
222
272
  # Map connector source and destination table with dataset entity
@@ -231,21 +281,90 @@ class FivetranSource(StatefulIngestionSourceBase):
231
281
  "sync_frequency": str(connector.sync_frequency),
232
282
  "destination_id": connector.destination_id,
233
283
  }
234
- datajob.properties = {
235
- **connector_properties,
236
- **lineage_properties,
237
- }
284
+
285
+ datajob.set_custom_properties({**connector_properties, **lineage_properties})
238
286
 
239
287
  return datajob
240
288
 
241
289
  def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
290
+ # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
291
+ datajob_v1 = DataJobV1(
292
+ id=datajob.name,
293
+ flow_urn=datajob.flow_urn,
294
+ platform_instance=self.config.platform_instance,
295
+ name=datajob.name,
296
+ inlets=datajob.inlets,
297
+ outlets=datajob.outlets,
298
+ fine_grained_lineages=datajob.fine_grained_lineages,
299
+ )
242
300
  return DataProcessInstance.from_datajob(
243
- datajob=datajob,
301
+ datajob=datajob_v1,
244
302
  id=job.job_id,
245
303
  clone_inlets=True,
246
304
  clone_outlets=True,
247
305
  )
248
306
 
307
+ def _get_connection_details_by_id(
308
+ self, connection_id: str
309
+ ) -> Optional[FivetranConnectionDetails]:
310
+ if self.api_client is None:
311
+ self.report.warning(
312
+ title="Fivetran API client is not initialized",
313
+ message="Google Sheets Connector details cannot be extracted, as Fivetran API client is not initialized.",
314
+ context=f"connector_id: {connection_id}",
315
+ )
316
+ return None
317
+
318
+ if connection_id in self._connection_details_cache:
319
+ return self._connection_details_cache[connection_id]
320
+
321
+ try:
322
+ self.report.report_fivetran_rest_api_call_count()
323
+ conn_details = self.api_client.get_connection_details_by_id(connection_id)
324
+ # Update Cache
325
+ if conn_details:
326
+ self._connection_details_cache[connection_id] = conn_details
327
+
328
+ return conn_details
329
+ except Exception as e:
330
+ self.report.warning(
331
+ title="Failed to get connection details for Google Sheets Connector",
332
+ message=f"Exception occurred while getting connection details from Fivetran API. {e}",
333
+ context=f"connector_id: {connection_id}",
334
+ )
335
+ return None
336
+
337
+ def _get_gsheet_sheet_id_from_url(
338
+ self, gsheets_conn_details: FivetranConnectionDetails
339
+ ) -> str:
340
+ # Extracting the sheet_id (1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo) from the sheet_id url
341
+ # "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
342
+ try:
343
+ parsed = urlparse(gsheets_conn_details.config.sheet_id)
344
+ # Example: https://docs.google.com/spreadsheets/d/<spreadsheetId>/edit
345
+ parts = parsed.path.split("/")
346
+ return parts[3] if len(parts) > 2 else ""
347
+ except Exception as e:
348
+ logger.warning(
349
+ f"Failed to extract sheet_id from the sheet_id url: {gsheets_conn_details.config.sheet_id}, {e}"
350
+ )
351
+
352
+ return ""
353
+
354
+ def _get_gsheet_named_range_dataset_id(
355
+ self, gsheets_conn_details: FivetranConnectionDetails
356
+ ) -> str:
357
+ sheet_id = self._get_gsheet_sheet_id_from_url(gsheets_conn_details)
358
+ named_range_id = (
359
+ f"{sheet_id}.{gsheets_conn_details.config.named_range}"
360
+ if sheet_id
361
+ else gsheets_conn_details.config.named_range
362
+ )
363
+ logger.debug(
364
+ f"Using gsheet_named_range_dataset_id: {named_range_id} for connector: {gsheets_conn_details.id}"
365
+ )
366
+ return named_range_id
367
+
249
368
  def _get_dpi_workunits(
250
369
  self, job: Job, dpi: DataProcessInstance
251
370
  ) -> Iterable[MetadataWorkUnit]:
@@ -277,17 +396,83 @@ class FivetranSource(StatefulIngestionSourceBase):
277
396
 
278
397
  def _get_connector_workunits(
279
398
  self, connector: Connector
280
- ) -> Iterable[MetadataWorkUnit]:
399
+ ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
281
400
  self.report.report_connectors_scanned()
401
+
402
+ """
403
+ -------------------------------------------------------
404
+ Special Handling for Google Sheets Connectors
405
+ -------------------------------------------------------
406
+ Google Sheets source is not supported by Datahub yet.
407
+ As a workaround, we are emitting a dataset entity for the Google Sheet
408
+ and adding it to the lineage. This workaround needs to be removed once
409
+ Datahub supports Google Sheets source natively.
410
+ -------------------------------------------------------
411
+ """
412
+ if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
413
+ # Get Google Sheet dataset details from Fivetran API
414
+ gsheets_conn_details: Optional[FivetranConnectionDetails] = (
415
+ self._get_connection_details_by_id(connector.connector_id)
416
+ )
417
+
418
+ if gsheets_conn_details:
419
+ gsheets_dataset = Dataset(
420
+ name=self._get_gsheet_sheet_id_from_url(gsheets_conn_details),
421
+ platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
422
+ env=self.config.env,
423
+ display_name=self._get_gsheet_sheet_id_from_url(
424
+ gsheets_conn_details
425
+ ),
426
+ external_url=gsheets_conn_details.config.sheet_id,
427
+ created=gsheets_conn_details.created_at,
428
+ last_modified=gsheets_conn_details.source_sync_details.last_synced,
429
+ subtype=DatasetSubTypes.GOOGLE_SHEETS,
430
+ custom_properties={
431
+ "ingested_by": "fivetran source",
432
+ "connector_id": gsheets_conn_details.id,
433
+ },
434
+ )
435
+ gsheets_named_range_dataset = Dataset(
436
+ name=self._get_gsheet_named_range_dataset_id(gsheets_conn_details),
437
+ platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
438
+ env=self.config.env,
439
+ display_name=gsheets_conn_details.config.named_range,
440
+ external_url=gsheets_conn_details.config.sheet_id,
441
+ created=gsheets_conn_details.created_at,
442
+ last_modified=gsheets_conn_details.source_sync_details.last_synced,
443
+ subtype=DatasetSubTypes.GOOGLE_SHEETS_NAMED_RANGE,
444
+ custom_properties={
445
+ "ingested_by": "fivetran source",
446
+ "connector_id": gsheets_conn_details.id,
447
+ },
448
+ upstreams=UpstreamLineage(
449
+ upstreams=[
450
+ UpstreamClass(
451
+ dataset=str(gsheets_dataset.urn),
452
+ type=DatasetLineageTypeClass.VIEW,
453
+ auditStamp=AuditStamp(
454
+ time=int(
455
+ gsheets_conn_details.created_at.timestamp()
456
+ * 1000
457
+ ),
458
+ actor=CORPUSER_DATAHUB,
459
+ ),
460
+ )
461
+ ],
462
+ fineGrainedLineages=None,
463
+ ),
464
+ )
465
+
466
+ yield gsheets_dataset
467
+ yield gsheets_named_range_dataset
468
+
282
469
  # Create dataflow entity with same name as connector name
283
470
  dataflow = self._generate_dataflow_from_connector(connector)
284
- for mcp in dataflow.generate_mcp():
285
- yield mcp.as_workunit()
471
+ yield dataflow
286
472
 
287
473
  # Map Fivetran's connector entity with Datahub's datajob entity
288
474
  datajob = self._generate_datajob_from_connector(connector)
289
- for mcp in datajob.generate_mcp(materialize_iolets=False):
290
- yield mcp.as_workunit()
475
+ yield datajob
291
476
 
292
477
  # Map Fivetran's job/sync history entity with Datahub's data process entity
293
478
  if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -309,7 +494,7 @@ class FivetranSource(StatefulIngestionSourceBase):
309
494
  ).workunit_processor,
310
495
  ]
311
496
 
312
- def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
497
+ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
313
498
  """
314
499
  Datahub Ingestion framework invoke this method
315
500
  """