acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. See the advisory on the package registry's page for this release for more details.

Files changed (414) hide show
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,7 @@ from sqlalchemy import create_engine
9
9
 
10
10
  from datahub.configuration.common import AllowDenyPattern, ConfigurationError
11
11
  from datahub.ingestion.source.fivetran.config import (
12
+ DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES,
12
13
  Constant,
13
14
  FivetranLogConfig,
14
15
  FivetranSourceReport,
@@ -69,9 +70,23 @@ class FivetranLogAPI:
69
70
  fivetran_log_query.set_schema(bigquery_destination_config.dataset)
70
71
 
71
72
  # The "database" should be the BigQuery project name.
72
- fivetran_log_database = engine.execute(
73
- "SELECT @@project_id"
74
- ).fetchone()[0]
73
+ result = engine.execute("SELECT @@project_id").fetchone()
74
+ if result is None:
75
+ raise ValueError("Failed to retrieve BigQuery project ID")
76
+ fivetran_log_database = result[0]
77
+ elif destination_platform == "databricks":
78
+ databricks_destination_config = (
79
+ self.fivetran_log_config.databricks_destination_config
80
+ )
81
+ if databricks_destination_config is not None:
82
+ engine = create_engine(
83
+ databricks_destination_config.get_sql_alchemy_url(
84
+ databricks_destination_config.catalog
85
+ ),
86
+ **databricks_destination_config.get_options(),
87
+ )
88
+ fivetran_log_query.set_schema(databricks_destination_config.log_schema)
89
+ fivetran_log_database = databricks_destination_config.catalog
75
90
  else:
76
91
  raise ConfigurationError(
77
92
  f"Destination platform '{destination_platform}' is not yet supported."
@@ -98,7 +113,11 @@ class FivetranLogAPI:
98
113
  """
99
114
  Returns dict of column lineage metadata with key as (<SOURCE_TABLE_ID>, <DESTINATION_TABLE_ID>)
100
115
  """
101
- all_column_lineage = defaultdict(list)
116
+ all_column_lineage: Dict[Tuple[str, str], List] = defaultdict(list)
117
+
118
+ if not connector_ids:
119
+ return dict(all_column_lineage)
120
+
102
121
  column_lineage_result = self._query(
103
122
  self.fivetran_log_query.get_column_lineage_query(
104
123
  connector_ids=connector_ids
@@ -116,7 +135,11 @@ class FivetranLogAPI:
116
135
  """
117
136
  Returns dict of table lineage metadata with key as 'CONNECTOR_ID'
118
137
  """
119
- connectors_table_lineage_metadata = defaultdict(list)
138
+ connectors_table_lineage_metadata: Dict[str, List] = defaultdict(list)
139
+
140
+ if not connector_ids:
141
+ return dict(connectors_table_lineage_metadata)
142
+
120
143
  table_lineage_result = self._query(
121
144
  self.fivetran_log_query.get_table_lineage_query(connector_ids=connector_ids)
122
145
  )
@@ -232,9 +255,15 @@ class FivetranLogAPI:
232
255
  return self._get_users().get(user_id)
233
256
 
234
257
  def _fill_connectors_lineage(self, connectors: List[Connector]) -> None:
235
- connector_ids = [connector.connector_id for connector in connectors]
236
- table_lineage_metadata = self._get_table_lineage_metadata(connector_ids)
237
- column_lineage_metadata = self._get_column_lineage_metadata(connector_ids)
258
+ # Create 2 filtered connector_ids lists - one for table lineage and one for column lineage
259
+ tll_connector_ids: List[str] = []
260
+ cll_connector_ids: List[str] = []
261
+ for connector in connectors:
262
+ tll_connector_ids.append(connector.connector_id)
263
+ if connector.connector_type not in DISABLE_COL_LINEAGE_FOR_CONNECTOR_TYPES:
264
+ cll_connector_ids.append(connector.connector_id)
265
+ table_lineage_metadata = self._get_table_lineage_metadata(tll_connector_ids)
266
+ column_lineage_metadata = self._get_column_lineage_metadata(cll_connector_ids)
238
267
  for connector in connectors:
239
268
  connector.lineage = self._extract_connector_lineage(
240
269
  table_lineage_result=table_lineage_metadata.get(connector.connector_id),
@@ -6,6 +6,21 @@ MAX_COLUMN_LINEAGE_PER_CONNECTOR = 1000
6
6
  MAX_JOBS_PER_CONNECTOR = 500
7
7
 
8
8
 
9
+ """
10
+ ------------------------------------------------------------------------------------------------------------
11
+ Fivetran Platform Connector Handling
12
+ ------------------------------------------------------------------------------------------------------------
13
+ Current Query Change Log: August 2025 (See: https://fivetran.com/docs/changelog/2025/august-2025)
14
+
15
+ All queries have to be updated as per Fivetran Platform Connector release if any. We expect customers
16
+ and fivetran to keep platform connector configured for DataHub with auto sync enabled to get latest changes.
17
+
18
+ References:
19
+ - Fivetran Release Notes: https://fivetran.com/docs/changelog (Look for "Fivetran Platform Connector")
20
+ - Latest Platform Connector Schema: https://fivetran.com/docs/logs/fivetran-platform?erdModal=open
21
+ """
22
+
23
+
9
24
  class FivetranLogQuery:
10
25
  # Note: All queries are written in Snowflake SQL.
11
26
  # They will be transpiled to the target database's SQL dialect at runtime.
@@ -18,22 +33,29 @@ class FivetranLogQuery:
18
33
  return f"use database {db_name}"
19
34
 
20
35
  def set_schema(self, schema_name: str) -> None:
21
- self.schema_clause = f"{schema_name}."
36
+ """
37
+ Using Snowflake quoted identifiers convention
38
+
39
+ Add double quotes around an identifier
40
+ Use two quotes to use the double quote character inside a quoted identifier
41
+ """
42
+ schema_name = schema_name.replace('"', '""')
43
+ self.schema_clause = f'"{schema_name}".'
22
44
 
23
45
  def get_connectors_query(self) -> str:
24
46
  return f"""\
25
47
  SELECT
26
- connector_id,
48
+ connection_id,
27
49
  connecting_user_id,
28
50
  connector_type_id,
29
- connector_name,
51
+ connection_name,
30
52
  paused,
31
53
  sync_frequency,
32
54
  destination_id
33
- FROM {self.schema_clause}connector
55
+ FROM {self.schema_clause}connection
34
56
  WHERE
35
57
  _fivetran_deleted = FALSE
36
- QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY _fivetran_synced DESC) = 1
58
+ QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY _fivetran_synced DESC) = 1
37
59
  """
38
60
 
39
61
  def get_users_query(self) -> str:
@@ -56,20 +78,20 @@ FROM {self.schema_clause}user
56
78
  return f"""\
57
79
  WITH ranked_syncs AS (
58
80
  SELECT
59
- connector_id,
81
+ connection_id,
60
82
  sync_id,
61
83
  MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time,
62
84
  MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time,
63
85
  MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data,
64
- ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn
86
+ ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY MAX(time_stamp) DESC) as rn
65
87
  FROM {self.schema_clause}log
66
88
  WHERE message_event in ('sync_start', 'sync_end')
67
89
  AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'
68
- AND connector_id IN ({formatted_connector_ids})
69
- GROUP BY connector_id, sync_id
90
+ AND connection_id IN ({formatted_connector_ids})
91
+ GROUP BY connection_id, sync_id
70
92
  )
71
93
  SELECT
72
- connector_id,
94
+ connection_id,
73
95
  sync_id,
74
96
  start_time,
75
97
  end_time,
@@ -78,7 +100,7 @@ FROM ranked_syncs
78
100
  WHERE rn <= {MAX_JOBS_PER_CONNECTOR}
79
101
  AND start_time IS NOT NULL
80
102
  AND end_time IS NOT NULL
81
- ORDER BY connector_id, end_time DESC
103
+ ORDER BY connection_id, end_time DESC
82
104
  """
83
105
 
84
106
  def get_table_lineage_query(self, connector_ids: List[str]) -> str:
@@ -90,7 +112,7 @@ SELECT
90
112
  *
91
113
  FROM (
92
114
  SELECT
93
- stm.connector_id as connector_id,
115
+ stm.connection_id as connection_id,
94
116
  stm.id as source_table_id,
95
117
  stm.name as source_table_name,
96
118
  ssm.name as source_schema_name,
@@ -98,18 +120,18 @@ FROM (
98
120
  dtm.name as destination_table_name,
99
121
  dsm.name as destination_schema_name,
100
122
  tl.created_at as created_at,
101
- ROW_NUMBER() OVER (PARTITION BY stm.connector_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
123
+ ROW_NUMBER() OVER (PARTITION BY stm.connection_id, stm.id, dtm.id ORDER BY tl.created_at DESC) as table_combo_rn
102
124
  FROM {self.schema_clause}table_lineage as tl
103
- JOIN {self.schema_clause}source_table_metadata as stm on tl.source_table_id = stm.id
104
- JOIN {self.schema_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id
105
- JOIN {self.schema_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id
106
- JOIN {self.schema_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id
107
- WHERE stm.connector_id IN ({formatted_connector_ids})
125
+ JOIN {self.schema_clause}source_table as stm on tl.source_table_id = stm.id -- stm: source_table_metadata
126
+ JOIN {self.schema_clause}destination_table as dtm on tl.destination_table_id = dtm.id -- dtm: destination_table_metadata
127
+ JOIN {self.schema_clause}source_schema as ssm on stm.schema_id = ssm.id -- ssm: source_schema_metadata
128
+ JOIN {self.schema_clause}destination_schema as dsm on dtm.schema_id = dsm.id -- dsm: destination_schema_metadata
129
+ WHERE stm.connection_id IN ({formatted_connector_ids})
108
130
  )
109
131
  -- Ensure that we only get back one entry per source and destination pair.
110
132
  WHERE table_combo_rn = 1
111
- QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
112
- ORDER BY connector_id, created_at DESC
133
+ QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR}
134
+ ORDER BY connection_id, created_at DESC
113
135
  """
114
136
 
115
137
  def get_column_lineage_query(self, connector_ids: List[str]) -> str:
@@ -124,25 +146,25 @@ SELECT
124
146
  destination_column_name
125
147
  FROM (
126
148
  SELECT
127
- stm.connector_id as connector_id,
149
+ stm.connection_id as connection_id,
128
150
  scm.table_id as source_table_id,
129
151
  dcm.table_id as destination_table_id,
130
152
  scm.name as source_column_name,
131
153
  dcm.name as destination_column_name,
132
154
  cl.created_at as created_at,
133
- ROW_NUMBER() OVER (PARTITION BY stm.connector_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
155
+ ROW_NUMBER() OVER (PARTITION BY stm.connection_id, cl.source_column_id, cl.destination_column_id ORDER BY cl.created_at DESC) as column_combo_rn
134
156
  FROM {self.schema_clause}column_lineage as cl
135
- JOIN {self.schema_clause}source_column_metadata as scm
157
+ JOIN {self.schema_clause}source_column as scm -- scm: source_column_metadata
136
158
  ON cl.source_column_id = scm.id
137
- JOIN {self.schema_clause}destination_column_metadata as dcm
159
+ JOIN {self.schema_clause}destination_column as dcm -- dcm: destination_column_metadata
138
160
  ON cl.destination_column_id = dcm.id
139
- -- Only joining source_table_metadata to get the connector_id.
140
- JOIN {self.schema_clause}source_table_metadata as stm
161
+ -- Only joining source_table to get the connection_id.
162
+ JOIN {self.schema_clause}source_table as stm -- stm: source_table_metadata
141
163
  ON scm.table_id = stm.id
142
- WHERE stm.connector_id IN ({formatted_connector_ids})
164
+ WHERE stm.connection_id IN ({formatted_connector_ids})
143
165
  )
144
166
  -- Ensure that we only get back one entry per (connector, source column, destination column) pair.
145
167
  WHERE column_combo_rn = 1
146
- QUALIFY ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
147
- ORDER BY connector_id, created_at DESC
168
+ QUALIFY ROW_NUMBER() OVER (PARTITION BY connection_id ORDER BY created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR}
169
+ ORDER BY connection_id, created_at DESC
148
170
  """
@@ -0,0 +1,65 @@
1
+ import logging
2
+
3
+ import requests
4
+ from requests.adapters import HTTPAdapter
5
+ from urllib3.util import Retry
6
+
7
+ from datahub.ingestion.source.fivetran.config import (
8
+ FivetranAPIConfig,
9
+ )
10
+ from datahub.ingestion.source.fivetran.response_models import FivetranConnectionDetails
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Retry configuration constants
15
+ RETRY_MAX_TIMES = 3
16
+ RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
17
+ RETRY_BACKOFF_FACTOR = 1
18
+ RETRY_ALLOWED_METHODS = ["GET"]
19
+
20
+
21
+ class FivetranAPIClient:
22
+ """Client for interacting with the Fivetran REST API."""
23
+
24
+ def __init__(self, config: FivetranAPIConfig) -> None:
25
+ self.config = config
26
+ self._session = self._create_session()
27
+
28
+ def _create_session(self) -> requests.Session:
29
+ """
30
+ Create a session with retry logic and basic authentication
31
+ """
32
+ requests_session = requests.Session()
33
+
34
+ # Configure retry strategy for transient failures
35
+ retry_strategy = Retry(
36
+ total=RETRY_MAX_TIMES,
37
+ backoff_factor=RETRY_BACKOFF_FACTOR,
38
+ status_forcelist=RETRY_STATUS_CODES,
39
+ allowed_methods=RETRY_ALLOWED_METHODS,
40
+ raise_on_status=True,
41
+ )
42
+
43
+ adapter = HTTPAdapter(max_retries=retry_strategy)
44
+ requests_session.mount("http://", adapter)
45
+ requests_session.mount("https://", adapter)
46
+
47
+ # Set up basic authentication
48
+ requests_session.auth = (self.config.api_key, self.config.api_secret)
49
+ requests_session.headers.update(
50
+ {
51
+ "Content-Type": "application/json",
52
+ "Accept": "application/json",
53
+ }
54
+ )
55
+ return requests_session
56
+
57
+ def get_connection_details_by_id(
58
+ self, connection_id: str
59
+ ) -> FivetranConnectionDetails:
60
+ """Get details for a specific connection."""
61
+ connection_details = self._session.get(
62
+ f"{self.config.base_url}/v1/connections/{connection_id}",
63
+ timeout=self.config.request_timeout_sec,
64
+ )
65
+ return FivetranConnectionDetails(**connection_details.json().get("data", {}))
@@ -0,0 +1,97 @@
1
+ import datetime
2
+ from typing import Dict, List
3
+
4
+ from pydantic import BaseModel
5
+
6
+
7
+ class FivetranConnectionWarnings(BaseModel):
8
+ code: str # Warning Code
9
+ message: str # Warning Message
10
+ details: Dict # Warning Details
11
+
12
+
13
+ class FivetranConnectionStatus(BaseModel):
14
+ setup_state: str # Setup State
15
+ schema_status: str # Schema Status
16
+ sync_state: str # Sync State
17
+ update_state: str # Update State
18
+ is_historical_sync: bool # Is Historical Sync
19
+ warnings: List[FivetranConnectionWarnings] # Warnings
20
+
21
+
22
+ class FivetranConnectionConfig(BaseModel):
23
+ # Note: Connection Config is different for different connectors
24
+ auth_type: str # Auth Type
25
+ sheet_id: str # Sheet ID - URL to the Google Sheet
26
+ named_range: str # Named Range
27
+
28
+
29
+ class FivetranConnectionSourceSyncDetails(BaseModel):
30
+ last_synced: datetime.datetime # Last Synced
31
+
32
+
33
+ class FivetranConnectionDetails(BaseModel):
34
+ """
35
+ Note: This reponse class only captures fields that are relevant to the Google Sheets Connector
36
+ """
37
+
38
+ id: str # Source ID
39
+ group_id: str # Destination ID
40
+ service: str # Connector Type
41
+ created_at: datetime.datetime
42
+ succeeded_at: datetime.datetime
43
+ paused: bool # Paused Status
44
+ sync_frequency: int # Sync Frequency (minutes)
45
+ status: FivetranConnectionStatus # Status
46
+ config: FivetranConnectionConfig # Connection Config
47
+ source_sync_details: FivetranConnectionSourceSyncDetails # Source Sync Details
48
+
49
+ """
50
+ # Sample Response for Google Sheets Connector
51
+ {
52
+ "code": "Success",
53
+ "data": {
54
+ "id": "dialectical_remindful",
55
+ "group_id": "empties_classification",
56
+ "service": "google_sheets",
57
+ "service_version": 1,
58
+ "schema": "fivetran_google_sheets.fivetran_google_sheets",
59
+ "connected_by": "sewn_restrained",
60
+ "created_at": "2025-10-06T17:53:01.554289Z",
61
+ "succeeded_at": "2025-10-06T22:55:45.275000Z",
62
+ "failed_at": null,
63
+ "paused": true,
64
+ "pause_after_trial": false,
65
+ "sync_frequency": 360,
66
+ "data_delay_threshold": 0,
67
+ "data_delay_sensitivity": "NORMAL",
68
+ "private_link_id": null,
69
+ "networking_method": "Directly",
70
+ "proxy_agent_id": null,
71
+ "schedule_type": "auto",
72
+ "status": {
73
+ "setup_state": "connected",
74
+ "schema_status": "ready",
75
+ "sync_state": "paused",
76
+ "update_state": "on_schedule",
77
+ "is_historical_sync": false,
78
+ "tasks": [],
79
+ "warnings": [
80
+ {
81
+ "code": "snowflake_discontinuing_password_auth",
82
+ "message": "Snowflake is discontinuing username/password authentication",
83
+ "details": {}
84
+ }
85
+ ]
86
+ },
87
+ "config": {
88
+ "auth_type": "ServiceAccount",
89
+ "sheet_id": "https://docs.google.com/spreadsheets/d/1A82PdLAE7NXLLb5JcLPKeIpKUMytXQba5Z-Ei-mbXLo/edit?gid=0#gid=0",
90
+ "named_range": "Fivetran_Test_Range"
91
+ },
92
+ "source_sync_details": {
93
+ "last_synced": "2025-10-06T22:55:27.371Z"
94
+ }
95
+ }
96
+ }
97
+ """
@@ -34,7 +34,6 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
34
34
  SoftDeletedEntitiesCleanupConfig,
35
35
  SoftDeletedEntitiesReport,
36
36
  )
37
- from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
38
37
 
39
38
  logger = logging.getLogger(__name__)
40
39
 
@@ -87,7 +86,6 @@ class DataHubGcSourceReport(
87
86
  DataProcessCleanupReport,
88
87
  SoftDeletedEntitiesReport,
89
88
  DatahubExecutionRequestCleanupReport,
90
- IngestionStageReport,
91
89
  ):
92
90
  expired_tokens_revoked: int = 0
93
91
 
@@ -16,6 +16,7 @@ from datahub.ingestion.api.decorators import (
16
16
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
17
17
  from datahub.ingestion.api.workunit import MetadataWorkUnit
18
18
  from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
19
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
19
20
  from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
20
21
  from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
21
22
  from datahub.ingestion.source.data_lake_common.object_store import (
@@ -36,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
36
37
 
37
38
  logger: logging.Logger = logging.getLogger(__name__)
38
39
 
40
+ GCS_ENDPOINT_URL = "https://storage.googleapis.com"
41
+
39
42
 
40
43
  class HMACKey(ConfigModel):
41
44
  hmac_access_id: str = Field(description="Access ID")
@@ -82,7 +85,14 @@ class GCSSourceReport(DataLakeSourceReport):
82
85
  @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
83
86
  @config_class(GCSSourceConfig)
84
87
  @support_status(SupportStatus.INCUBATING)
85
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
88
+ @capability(
89
+ SourceCapability.CONTAINERS,
90
+ "Enabled by default",
91
+ subtype_modifier=[
92
+ SourceCapabilityModifier.GCS_BUCKET,
93
+ SourceCapabilityModifier.FOLDER,
94
+ ],
95
+ )
86
96
  @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
87
97
  @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
88
98
  class GCSSource(StatefulIngestionSourceBase):
@@ -104,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
104
114
  s3_config = DataLakeSourceConfig(
105
115
  path_specs=s3_path_specs,
106
116
  aws_config=AwsConnectionConfig(
107
- aws_endpoint_url="https://storage.googleapis.com",
117
+ aws_endpoint_url=GCS_ENDPOINT_URL,
108
118
  aws_access_key_id=self.config.credential.hmac_access_id,
109
119
  aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
110
120
  aws_region="auto",
@@ -112,15 +122,26 @@ class GCSSource(StatefulIngestionSourceBase):
112
122
  env=self.config.env,
113
123
  max_rows=self.config.max_rows,
114
124
  number_of_files_to_sample=self.config.number_of_files_to_sample,
125
+ platform=PLATFORM_GCS, # Ensure GCS platform is used for correct container subtypes
126
+ platform_instance=self.config.platform_instance,
115
127
  )
116
128
  return s3_config
117
129
 
118
130
  def create_equivalent_s3_path_specs(self):
119
131
  s3_path_specs = []
120
132
  for path_spec in self.config.path_specs:
133
+ # PathSpec modifies the passed-in include to add /** to the end if
134
+ # autodetecting partitions. Remove that, otherwise creating a new
135
+ # PathSpec will complain.
136
+ # TODO: this should be handled inside PathSpec, which probably shouldn't
137
+ # modify its input.
138
+ include = path_spec.include
139
+ if include.endswith("{table}/**") and not path_spec.allow_double_stars:
140
+ include = include.removesuffix("**")
141
+
121
142
  s3_path_specs.append(
122
143
  PathSpec(
123
- include=path_spec.include.replace("gs://", "s3://"),
144
+ include=include.replace("gs://", "s3://"),
124
145
  exclude=(
125
146
  [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
126
147
  if path_spec.exclude
@@ -131,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
131
152
  table_name=path_spec.table_name,
132
153
  enable_compression=path_spec.enable_compression,
133
154
  sample_files=path_spec.sample_files,
155
+ allow_double_stars=path_spec.allow_double_stars,
156
+ autodetect_partitions=path_spec.autodetect_partitions,
157
+ include_hidden_folders=path_spec.include_hidden_folders,
158
+ tables_filter_pattern=path_spec.tables_filter_pattern,
159
+ traversal_method=path_spec.traversal_method,
134
160
  )
135
161
  )
136
162
 
@@ -138,7 +164,9 @@ class GCSSource(StatefulIngestionSourceBase):
138
164
 
139
165
  def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
140
166
  config = self.create_equivalent_s3_config()
141
- s3_source = S3Source(config, PipelineContext(ctx.run_id))
167
+ # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
168
+ s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
169
+ s3_source = S3Source(config, s3_ctx)
142
170
  return self.s3_source_overrides(s3_source)
143
171
 
144
172
  def s3_source_overrides(self, source: S3Source) -> S3Source: