acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/feast.py
@@ -1,5 +1,6 @@
+import pathlib
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Union
 
 import feast.types
 from feast import (
@@ -97,7 +98,7 @@ class FeastRepositorySourceConfig(
     StatefulIngestionConfigBase,
 ):
     path: str = Field(description="Path to Feast repository")
-    fs_yaml_file: Optional[str] = Field(
+    fs_yaml_file: Optional[pathlib.Path] = Field(
         default=None,
         description="Path to the `feature_store.yaml` file used to configure the feature store",
     )
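The `Optional[str]` to `Optional[pathlib.Path]` change is backwards-compatible for recipes that pass a plain string, since pydantic coerces string values into `Path` objects. A minimal standalone sketch of that coercion (demo model only, not part of the package):

import pathlib
from typing import Optional

from pydantic import BaseModel

class PathCoercionDemo(BaseModel):
    # Mirrors the new fs_yaml_file type; pydantic converts str -> Path.
    fs_yaml_file: Optional[pathlib.Path] = None

print(PathCoercionDemo(fs_yaml_file="feature_repo/feature_store.yaml").fs_yaml_file)
# PosixPath('feature_repo/feature_store.yaml')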
@@ -142,17 +143,14 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     - Column types associated with each entity and feature
     """
 
-    platform = "feast"
-    source_config: FeastRepositorySourceConfig
-    report: StaleEntityRemovalSourceReport
-    feature_store: FeatureStore
+    platform: ClassVar[str] = "feast"
 
     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
-        self.source_config = config
-        self.ctx = ctx
-        self.report = StaleEntityRemovalSourceReport()
-        self.feature_store = FeatureStore(
+        self.source_config: FeastRepositorySourceConfig = config
+        self.ctx: PipelineContext = ctx
+        self.report: StaleEntityRemovalSourceReport = StaleEntityRemovalSourceReport()
+        self.feature_store: FeatureStore = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
         )
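For context on the `ClassVar` annotation above: a bare annotated class attribute can be picked up as a per-instance field by dataclass- or pydantic-style machinery, while `ClassVar` marks it as a class-level constant. A small illustration of the general semantics using a plain dataclass (not the actual source class):

from dataclasses import dataclass, fields
from typing import ClassVar

@dataclass
class WithoutClassVar:
    platform: str = "feast"  # becomes an instance field

@dataclass
class WithClassVar:
    platform: ClassVar[str] = "feast"  # stays a class-level constant

print([f.name for f in fields(WithoutClassVar)])  # ['platform']
print([f.name for f in fields(WithClassVar)])     # []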
datahub/ingestion/source/file.py
@@ -18,7 +18,9 @@ from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -187,6 +189,7 @@ class FileSourceReport(StaleEntityRemovalSourceReport):
 @platform_name("Metadata File")
 @config_class(FileSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin pulls metadata from a previously generated file.
datahub/ingestion/source/fivetran/config.py
@@ -29,6 +29,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -56,8 +57,8 @@ class Constant:
     STATUS = "status"
     USER_ID = "user_id"
     EMAIL = "email"
-    CONNECTOR_ID = "connector_id"
-    CONNECTOR_NAME = "connector_name"
+    CONNECTOR_ID = "connection_id"
+    CONNECTOR_NAME = "connection_name"
     CONNECTOR_TYPE_ID = "connector_type_id"
     PAUSED = "paused"
     SYNC_FREQUENCY = "sync_frequency"
@@ -67,13 +68,22 @@ class Constant:
     SUCCESSFUL = "SUCCESSFUL"
     FAILURE_WITH_TASK = "FAILURE_WITH_TASK"
     CANCELED = "CANCELED"
+    GOOGLE_SHEETS_CONNECTOR_TYPE = "google_sheets"
 
 
+# Key: Connector Type, Value: Platform ID/Name
 KNOWN_DATA_PLATFORM_MAPPING = {
+    "google_cloud_postgresql": "postgres",
     "postgres": "postgres",
     "snowflake": "snowflake",
+    Constant.GOOGLE_SHEETS_CONNECTOR_TYPE: Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
 }
 
+# Note: (As of Oct 2025) Fivetran Platform Connector has stale lineage metadata for Google Sheets column data (deleted/renamed).
+# Ref: https://fivetran.com/docs/connectors/files/google-sheets#deletingdata
+# TODO: Remove Google Sheets connector type from DISABLE_LINEAGE_FOR_CONNECTOR_TYPES
+DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES = [Constant.GOOGLE_SHEETS_CONNECTOR_TYPE]
+
 
 class SnowflakeDestinationConfig(SnowflakeConnectionConfig):
     database: str = Field(description="The fivetran connector log database.")
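How this mapping is consumed (a condensed sketch, inferred from the "Guessing source platform for lineage" fallback later in this diff): known connector types resolve to a DataHub platform id, and unknown types fall back to the connector type itself, which the source reports as a guess.

KNOWN_DATA_PLATFORM_MAPPING = {
    "google_cloud_postgresql": "postgres",
    "postgres": "postgres",
    "snowflake": "snowflake",
    "google_sheets": "google_sheets",
}

def resolve_platform(connector_type: str) -> str:
    # Fall back to the raw connector type when no mapping exists.
    return KNOWN_DATA_PLATFORM_MAPPING.get(connector_type, connector_type)

assert resolve_platform("google_cloud_postgresql") == "postgres"
assert resolve_platform("unknown_db") == "unknown_db"  # guessed, with a warning in the real source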
@@ -84,10 +94,34 @@ class BigQueryDestinationConfig(BigQueryConnectionConfig):
     dataset: str = Field(description="The fivetran connector log dataset.")
 
 
+class DatabricksDestinationConfig(UnityCatalogConnectionConfig):
+    catalog: str = Field(description="The fivetran connector log catalog.")
+    log_schema: str = Field(description="The fivetran connector log schema.")
+
+    @pydantic.validator("warehouse_id")
+    def warehouse_id_should_not_be_empty(cls, warehouse_id: Optional[str]) -> str:
+        if warehouse_id is None or (warehouse_id and warehouse_id.strip() == ""):
+            raise ValueError("Fivetran requires warehouse_id to be set")
+        return warehouse_id
+
+
+class FivetranAPIConfig(ConfigModel):
+    api_key: str = Field(description="Fivetran API key")
+    api_secret: str = Field(description="Fivetran API secret")
+    base_url: str = Field(
+        default="https://api.fivetran.com", description="Fivetran API base URL"
+    )
+    request_timeout_sec: int = Field(
+        default=30, description="Request timeout in seconds"
+    )
+
+
 class FivetranLogConfig(ConfigModel):
-    destination_platform: Literal["snowflake", "bigquery"] = pydantic.Field(
-        default="snowflake",
-        description="The destination platform where fivetran connector log tables are dumped.",
+    destination_platform: Literal["snowflake", "bigquery", "databricks"] = (
+        pydantic.Field(
+            default="snowflake",
+            description="The destination platform where fivetran connector log tables are dumped.",
+        )
     )
     snowflake_destination_config: Optional[SnowflakeDestinationConfig] = pydantic.Field(
         default=None,
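A standalone sketch of how the `warehouse_id` validator behaves when the field is supplied, under pydantic v1 semantics (demo model only; in the package the field comes from `UnityCatalogConnectionConfig`):

from typing import Optional

import pydantic

class WarehouseDemo(pydantic.BaseModel):
    warehouse_id: Optional[str] = None

    @pydantic.validator("warehouse_id")
    def warehouse_id_should_not_be_empty(cls, warehouse_id: Optional[str]) -> str:
        if warehouse_id is None or (warehouse_id and warehouse_id.strip() == ""):
            raise ValueError("Fivetran requires warehouse_id to be set")
        return warehouse_id

WarehouseDemo(warehouse_id="abc-123")  # passes
# WarehouseDemo(warehouse_id="   ")    # raises ValidationError
# WarehouseDemo(warehouse_id=None)     # raises ValidationError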
@@ -97,11 +131,17 @@ class FivetranLogConfig(ConfigModel):
         default=None,
         description="If destination platform is 'bigquery', provide bigquery configuration.",
     )
+    databricks_destination_config: Optional[DatabricksDestinationConfig] = (
+        pydantic.Field(
+            default=None,
+            description="If destination platform is 'databricks', provide databricks configuration.",
+        )
+    )
     _rename_destination_config = pydantic_renamed_field(
         "destination_config", "snowflake_destination_config"
     )
 
-    @root_validator(pre=True)
+    @root_validator(skip_on_failure=True)
     def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict:
         destination_platform = values["destination_platform"]
         if destination_platform == "snowflake":
@@ -114,6 +154,11 @@ class FivetranLogConfig(ConfigModel):
             raise ValueError(
                 "If destination platform is 'bigquery', user must provide bigquery destination configuration in the recipe."
             )
+        elif destination_platform == "databricks":
+            if "databricks_destination_config" not in values:
+                raise ValueError(
+                    "If destination platform is 'databricks', user must provide databricks destination configuration in the recipe."
+                )
         else:
             raise ValueError(
                 f"Destination platform '{destination_platform}' is not yet supported."
@@ -137,6 +182,7 @@ class MetadataExtractionPerfReport(Report):
 @dataclasses.dataclass
 class FivetranSourceReport(StaleEntityRemovalSourceReport):
     connectors_scanned: int = 0
+    fivetran_rest_api_call_count: int = 0
     filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList)
     metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field(
         default_factory=MetadataExtractionPerfReport
@@ -148,6 +194,9 @@ class FivetranSourceReport(StaleEntityRemovalSourceReport):
     def report_connectors_dropped(self, connector: str) -> None:
         self.filtered_connectors.append(connector)
 
+    def report_fivetran_rest_api_call_count(self) -> None:
+        self.fivetran_rest_api_call_count += 1
+
 
 class PlatformDetail(ConfigModel):
     platform: Optional[str] = pydantic.Field(
@@ -194,7 +243,7 @@ class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin
 
     # Configuration for stateful ingestion
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field(
-        default=None, description="Airbyte Stateful Ingestion Config."
+        default=None, description="Fivetran Stateful Ingestion Config."
     )
 
     # Fivetran connector all sources to platform instance mapping
@@ -208,6 +257,16 @@ class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin
         description="A mapping of destination id to its platform/instance/env details.",
     )
 
+    """
+    Use Fivetran REST API to get :
+    - Google Sheets Connector details and emit related entities
+    Fivetran Platform Connector syncs limited information about the Google Sheets Connector.
+    """
+    api_config: Optional[FivetranAPIConfig] = Field(
+        default=None,
+        description="Fivetran REST API configuration, used to provide wider support for connections.",
+    )
+
     @pydantic.root_validator(pre=True)
     def compat_sources_to_database(cls, values: Dict) -> Dict:
         if "sources_to_database" in values:
datahub/ingestion/source/fivetran/fivetran.py
@@ -1,8 +1,9 @@
 import logging
-from typing import Dict, Iterable, List, Optional
+from typing import Dict, Iterable, List, Optional, Union
+from urllib.parse import urlparse
 
 import datahub.emitter.mce_builder as builder
-from datahub.api.entities.datajob import DataFlow, DataJob
+from datahub.api.entities.datajob import DataJob as DataJobV1
 from datahub.api.entities.dataprocess.dataprocess_instance import (
     DataProcessInstance,
     InstanceRunResult,
@@ -16,8 +17,13 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source import (
+    MetadataWorkUnitProcessor,
+    SourceReport,
+    StructuredLogCategory,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.fivetran.config import (
     KNOWN_DATA_PLATFORM_MAPPING,
     Constant,
@@ -31,27 +37,39 @@ from datahub.ingestion.source.fivetran.fivetran_query import (
     MAX_JOBS_PER_CONNECTOR,
     MAX_TABLE_LINEAGE_PER_CONNECTOR,
 )
+from datahub.ingestion.source.fivetran.fivetran_rest_api import FivetranAPIClient
+from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
+from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     FineGrainedLineage,
     FineGrainedLineageDownstreamType,
     FineGrainedLineageUpstreamType,
+    UpstreamLineage,
+)
+from datahub.metadata.schema_classes import (
+    DatasetLineageTypeClass,
+    UpstreamClass,
 )
-from datahub.utilities.urns.data_flow_urn import DataFlowUrn
-from datahub.utilities.urns.dataset_urn import DatasetUrn
+from datahub.metadata.urns import CorpUserUrn, DataFlowUrn, DatasetUrn
+from datahub.sdk.dataflow import DataFlow
+from datahub.sdk.datajob import DataJob
+from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 
 # Logger instance
 logger = logging.getLogger(__name__)
+CORPUSER_DATAHUB = "urn:li:corpuser:datahub"
 
 
 @platform_name("Fivetran")
 @config_class(FivetranSourceConfig)
-@support_status(SupportStatus.INCUBATING)
+@support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(
     SourceCapability.LINEAGE_FINE,
@@ -60,7 +78,6 @@ logger = logging.getLogger(__name__)
 class FivetranSource(StatefulIngestionSourceBase):
     """
     This plugin extracts fivetran users, connectors, destinations and sync history.
-    This plugin is in beta and has only been tested on Snowflake connector.
     """
 
     config: FivetranSourceConfig
@@ -71,12 +88,16 @@ class FivetranSource(StatefulIngestionSourceBase):
         super().__init__(config, ctx)
         self.config = config
         self.report = FivetranSourceReport()
-
         self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
+        self.api_client: Optional[FivetranAPIClient] = None
+        self._connection_details_cache: Dict[str, FivetranConnectionDetails] = {}
+
+        if self.config.api_config:
+            self.api_client = FivetranAPIClient(self.config.api_config)
 
     def _extend_lineage(self, connector: Connector, datajob: DataJob) -> Dict[str, str]:
-        input_dataset_urn_list: List[DatasetUrn] = []
-        output_dataset_urn_list: List[DatasetUrn] = []
+        input_dataset_urn_list: List[Union[str, DatasetUrn]] = []
+        output_dataset_urn_list: List[Union[str, DatasetUrn]] = []
         fine_grained_lineage: List[FineGrainedLineage] = []
 
         # TODO: Once Fivetran exposes the database via the API, we shouldn't ask for it via config.
@@ -94,8 +115,10 @@ class FivetranSource(StatefulIngestionSourceBase):
             self.report.info(
                 title="Guessing source platform for lineage",
                 message="We encountered a connector type that we don't fully support yet. "
-                "We will attempt to guess the platform based on the connector type.",
-                context=f"{connector.connector_name} (connector_id: {connector.connector_id}, connector_type: {connector.connector_type})",
+                "We will attempt to guess the platform based on the connector type. "
+                "Note that we use connector_id as the key not connector_name which you may see in the UI of Fivetran. ",
+                context=f"connector_name: {connector.connector_name} (connector_id: {connector.connector_id}, connector_type: {connector.connector_type})",
+                log_category=StructuredLogCategory.LINEAGE,
             )
             source_details.platform = connector.connector_type
 
@@ -124,17 +147,43 @@ class FivetranSource(StatefulIngestionSourceBase):
                 if source_details.include_schema_in_urn
                 else lineage.source_table.split(".", 1)[1]
             )
-            input_dataset_urn = DatasetUrn.create_from_ids(
-                platform_id=source_details.platform,
-                table_name=(
-                    f"{source_details.database.lower()}.{source_table}"
-                    if source_details.database
-                    else source_table
-                ),
-                env=source_details.env,
-                platform_instance=source_details.platform_instance,
-            )
-            input_dataset_urn_list.append(input_dataset_urn)
+            input_dataset_urn: Optional[DatasetUrn] = None
+            # Special Handling for Google Sheets Connectors
+            if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
+                # Get Google Sheet dataset details from Fivetran API
+                # This is cached in the api_client
+                gsheets_conn_details: Optional[FivetranConnectionDetails] = (
+                    self._get_connection_details_by_id(connector.connector_id)
+                )
+
+                if gsheets_conn_details:
+                    input_dataset_urn = DatasetUrn.create_from_ids(
+                        platform_id=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                        table_name=self._get_gsheet_named_range_dataset_id(
+                            gsheets_conn_details
+                        ),
+                        env=source_details.env,
+                    )
+                else:
+                    self.report.warning(
+                        title="Failed to extract lineage for Google Sheets Connector",
+                        message="Unable to extract lineage for Google Sheets Connector, as the connector details are not available from Fivetran API.",
+                        context=f"{connector.connector_name} (connector_id: {connector.connector_id})",
+                    )
+            else:
+                input_dataset_urn = DatasetUrn.create_from_ids(
+                    platform_id=source_details.platform,
+                    table_name=(
+                        f"{source_details.database.lower()}.{source_table}"
+                        if source_details.database
+                        else source_table
+                    ),
+                    env=source_details.env,
+                    platform_instance=source_details.platform_instance,
+                )
+
+            if input_dataset_urn:
+                input_dataset_urn_list.append(input_dataset_urn)
 
             destination_table = (
                 lineage.destination_table
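For reference, the Google Sheets branch above yields a URN of this shape (sheet id taken from the code comment further down in this diff; the named range is a placeholder):

from datahub.metadata.urns import DatasetUrn

urn = DatasetUrn.create_from_ids(
    platform_id="google_sheets",
    table_name="1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo.my_named_range",
    env="PROD",
)
print(urn)
# urn:li:dataset:(urn:li:dataPlatform:google_sheets,1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo.my_named_range,PROD)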
@@ -178,9 +227,9 @@ class FivetranSource(StatefulIngestionSourceBase):
                 )
             )
 
-        datajob.inlets.extend(input_dataset_urn_list)
-        datajob.outlets.extend(output_dataset_urn_list)
-        datajob.fine_grained_lineages.extend(fine_grained_lineage)
+        datajob.set_inlets(input_dataset_urn_list)
+        datajob.set_outlets(output_dataset_urn_list)
+        datajob.set_fine_grained_lineages(fine_grained_lineage)
 
         return dict(
             **{
@@ -197,10 +246,10 @@
 
     def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
         return DataFlow(
-            orchestrator=Constant.ORCHESTRATOR,
-            id=connector.connector_id,
+            platform=Constant.ORCHESTRATOR,
+            name=connector.connector_id,
             env=self.config.env,
-            name=connector.connector_name,
+            display_name=connector.connector_name,
             platform_instance=self.config.platform_instance,
         )
 
@@ -213,11 +262,11 @@
         )
         owner_email = self.audit_log.get_user_email(connector.user_id)
         datajob = DataJob(
-            id=connector.connector_id,
+            name=connector.connector_id,
             flow_urn=dataflow_urn,
             platform_instance=self.config.platform_instance,
-            name=connector.connector_name,
-            owners={owner_email} if owner_email else set(),
+            display_name=connector.connector_name,
+            owners=[CorpUserUrn(owner_email)] if owner_email else None,
         )
 
         # Map connector source and destination table with dataset entity
@@ -232,21 +281,90 @@ class FivetranSource(StatefulIngestionSourceBase):
             "sync_frequency": str(connector.sync_frequency),
             "destination_id": connector.destination_id,
         }
-        datajob.properties = {
-            **connector_properties,
-            **lineage_properties,
-        }
+
+        datajob.set_custom_properties({**connector_properties, **lineage_properties})
 
         return datajob
 
     def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+        # hack: convert to old instance for DataProcessInstance.from_datajob compatibility
+        datajob_v1 = DataJobV1(
+            id=datajob.name,
+            flow_urn=datajob.flow_urn,
+            platform_instance=self.config.platform_instance,
+            name=datajob.name,
+            inlets=datajob.inlets,
+            outlets=datajob.outlets,
+            fine_grained_lineages=datajob.fine_grained_lineages,
+        )
         return DataProcessInstance.from_datajob(
-            datajob=datajob,
+            datajob=datajob_v1,
             id=job.job_id,
             clone_inlets=True,
             clone_outlets=True,
         )
 
+    def _get_connection_details_by_id(
+        self, connection_id: str
+    ) -> Optional[FivetranConnectionDetails]:
+        if self.api_client is None:
+            self.report.warning(
+                title="Fivetran API client is not initialized",
+                message="Google Sheets Connector details cannot be extracted, as Fivetran API client is not initialized.",
+                context=f"connector_id: {connection_id}",
+            )
+            return None
+
+        if connection_id in self._connection_details_cache:
+            return self._connection_details_cache[connection_id]
+
+        try:
+            self.report.report_fivetran_rest_api_call_count()
+            conn_details = self.api_client.get_connection_details_by_id(connection_id)
+            # Update Cache
+            if conn_details:
+                self._connection_details_cache[connection_id] = conn_details
+
+            return conn_details
+        except Exception as e:
+            self.report.warning(
+                title="Failed to get connection details for Google Sheets Connector",
+                message=f"Exception occurred while getting connection details from Fivetran API. {e}",
+                context=f"connector_id: {connection_id}",
+            )
+            return None
+
+    def _get_gsheet_sheet_id_from_url(
+        self, gsheets_conn_details: FivetranConnectionDetails
+    ) -> str:
+        # Extracting the sheet_id (1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo) from the sheet_id url
+        # "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
+        try:
+            parsed = urlparse(gsheets_conn_details.config.sheet_id)
+            # Example: https://docs.google.com/spreadsheets/d/<spreadsheetId>/edit
+            parts = parsed.path.split("/")
+            return parts[3] if len(parts) > 2 else ""
+        except Exception as e:
+            logger.warning(
+                f"Failed to extract sheet_id from the sheet_id url: {gsheets_conn_details.config.sheet_id}, {e}"
+            )
+
+        return ""
+
+    def _get_gsheet_named_range_dataset_id(
+        self, gsheets_conn_details: FivetranConnectionDetails
+    ) -> str:
+        sheet_id = self._get_gsheet_sheet_id_from_url(gsheets_conn_details)
+        named_range_id = (
+            f"{sheet_id}.{gsheets_conn_details.config.named_range}"
+            if sheet_id
+            else gsheets_conn_details.config.named_range
+        )
+        logger.debug(
+            f"Using gsheet_named_range_dataset_id: {named_range_id} for connector: {gsheets_conn_details.id}"
+        )
+        return named_range_id
+
     def _get_dpi_workunits(
         self, job: Job, dpi: DataProcessInstance
     ) -> Iterable[MetadataWorkUnit]:
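A quick standalone check of the path-splitting logic in `_get_gsheet_sheet_id_from_url` (URL taken from the code comment). Note the guard compares `len(parts) > 2` while indexing `parts[3]`; for this URL shape `parts` has five elements, so the index is safe:

from urllib.parse import urlparse

url = "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0"
parts = urlparse(url).path.split("/")
# parts == ['', 'spreadsheets', 'd', '1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo', 'edit']
assert parts[3] == "1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo"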
@@ -278,17 +396,83 @@ class FivetranSource(StatefulIngestionSourceBase):
 
     def _get_connector_workunits(
         self, connector: Connector
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         self.report.report_connectors_scanned()
+
+        """
+        -------------------------------------------------------
+        Special Handling for Google Sheets Connectors
+        -------------------------------------------------------
+        Google Sheets source is not supported by Datahub yet.
+        As a workaround, we are emitting a dataset entity for the Google Sheet
+        and adding it to the lineage. This workaround needs to be removed once
+        Datahub supports Google Sheets source natively.
+        -------------------------------------------------------
+        """
+        if connector.connector_type == Constant.GOOGLE_SHEETS_CONNECTOR_TYPE:
+            # Get Google Sheet dataset details from Fivetran API
+            gsheets_conn_details: Optional[FivetranConnectionDetails] = (
+                self._get_connection_details_by_id(connector.connector_id)
+            )
+
+            if gsheets_conn_details:
+                gsheets_dataset = Dataset(
+                    name=self._get_gsheet_sheet_id_from_url(gsheets_conn_details),
+                    platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                    env=self.config.env,
+                    display_name=self._get_gsheet_sheet_id_from_url(
+                        gsheets_conn_details
+                    ),
+                    external_url=gsheets_conn_details.config.sheet_id,
+                    created=gsheets_conn_details.created_at,
+                    last_modified=gsheets_conn_details.source_sync_details.last_synced,
+                    subtype=DatasetSubTypes.GOOGLE_SHEETS,
+                    custom_properties={
+                        "ingested_by": "fivetran source",
+                        "connector_id": gsheets_conn_details.id,
+                    },
+                )
+                gsheets_named_range_dataset = Dataset(
+                    name=self._get_gsheet_named_range_dataset_id(gsheets_conn_details),
+                    platform=Constant.GOOGLE_SHEETS_CONNECTOR_TYPE,
+                    env=self.config.env,
+                    display_name=gsheets_conn_details.config.named_range,
+                    external_url=gsheets_conn_details.config.sheet_id,
+                    created=gsheets_conn_details.created_at,
+                    last_modified=gsheets_conn_details.source_sync_details.last_synced,
+                    subtype=DatasetSubTypes.GOOGLE_SHEETS_NAMED_RANGE,
+                    custom_properties={
+                        "ingested_by": "fivetran source",
+                        "connector_id": gsheets_conn_details.id,
+                    },
+                    upstreams=UpstreamLineage(
+                        upstreams=[
+                            UpstreamClass(
+                                dataset=str(gsheets_dataset.urn),
+                                type=DatasetLineageTypeClass.VIEW,
+                                auditStamp=AuditStamp(
+                                    time=int(
+                                        gsheets_conn_details.created_at.timestamp()
+                                        * 1000
+                                    ),
+                                    actor=CORPUSER_DATAHUB,
+                                ),
+                            )
+                        ],
+                        fineGrainedLineages=None,
+                    ),
+                )
+
+                yield gsheets_dataset
+                yield gsheets_named_range_dataset
+
         # Create dataflow entity with same name as connector name
         dataflow = self._generate_dataflow_from_connector(connector)
-        for mcp in dataflow.generate_mcp():
-            yield mcp.as_workunit()
+        yield dataflow
 
         # Map Fivetran's connector entity with Datahub's datajob entity
         datajob = self._generate_datajob_from_connector(connector)
-        for mcp in datajob.generate_mcp(materialize_iolets=False):
-            yield mcp.as_workunit()
+        yield datajob
 
         # Map Fivetran's job/sync history entity with Datahub's data process entity
         if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR:
@@ -310,7 +494,7 @@
             ).workunit_processor,
         ]
 
-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         """
         Datahub Ingestion framework invoke this method
         """