acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
12
12
  from datahub.ingestion.api.common import PipelineContext
13
13
  from datahub.ingestion.api.source import SourceReport
14
14
  from datahub.ingestion.graph.client import DataHubGraph
15
- from datahub.ingestion.graph.filters import RemovedStatusFilter
15
+ from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
16
16
  from datahub.utilities.lossy_collections import LossyList
17
17
  from datahub.utilities.stats_collections import TopKDict
18
18
  from datahub.utilities.urns._urn_base import Urn
19
+ from datahub.utilities.urns.error import InvalidUrnError
19
20
 
20
21
  logger = logging.getLogger(__name__)
21
22
 
22
- QUERY_ENTITIES = """
23
- query listEntities($input: ScrollAcrossEntitiesInput!) {
24
- scrollAcrossEntities(input: $input) {
25
- nextScrollId
26
- count
27
- searchResults {
28
- entity {
29
- ... on QueryEntity {
30
- urn
31
- }
32
- ... on DataProcessInstance {
33
- urn
34
- }
35
- }
36
- }
37
- }
38
- }
39
- """
40
-
41
23
 
42
24
  class SoftDeletedEntitiesCleanupConfig(ConfigModel):
43
25
  enabled: bool = Field(
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
64
46
  )
65
47
 
66
48
  entity_types: Optional[List[str]] = Field(
67
- default=None,
49
+ # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
50
+ default=[
51
+ "dataset",
52
+ "dashboard",
53
+ "chart",
54
+ "mlmodel",
55
+ "mlmodelGroup",
56
+ "mlfeatureTable",
57
+ "mlfeature",
58
+ "mlprimaryKey",
59
+ "dataFlow",
60
+ "dataJob",
61
+ "glossaryTerm",
62
+ "glossaryNode",
63
+ "tag",
64
+ "role",
65
+ "corpuser",
66
+ "corpGroup",
67
+ "container",
68
+ "domain",
69
+ "dataProduct",
70
+ "notebook",
71
+ "businessAttribute",
72
+ "schemaField",
73
+ "query",
74
+ "dataProcessInstance",
75
+ ],
68
76
  description="List of entity types to cleanup",
69
77
  )
70
78
 
@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
103
111
  num_entities_found: Dict[str, int] = field(default_factory=dict)
104
112
  num_soft_deleted_entity_processed: int = 0
105
113
  num_soft_deleted_retained_due_to_age: int = 0
114
+ num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
115
+ default_factory=TopKDict
116
+ )
106
117
  num_soft_deleted_entity_removal_started: int = 0
107
118
  num_hard_deleted: int = 0
108
119
  num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
111
122
  )
112
123
  runtime_limit_reached: bool = False
113
124
  deletion_limit_reached: bool = False
125
+ num_soft_deleted_entity_found: int = 0
126
+ num_soft_deleted_entity_invalid_urn: int = 0
114
127
 
115
128
 
116
129
  class SoftDeletedEntitiesCleanup:
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
133
146
  self.config = config
134
147
  self.report = report
135
148
  self.dry_run = dry_run
136
- self.start_time = 0.0
149
+ self.start_time = time.time()
137
150
  self._report_lock: Lock = Lock()
138
151
  self.last_print_time = 0.0
139
152
 
@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
142
155
  with self._report_lock:
143
156
  self.report.num_soft_deleted_retained_due_to_age += 1
144
157
 
158
+ def _increment_retained_by_type(self, type: str) -> None:
159
+ """Thread-safe method to update report fields"""
160
+ with self._report_lock:
161
+ self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
162
+ self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
163
+ + 1
164
+ )
165
+
145
166
  def _increment_removal_started_count(self) -> None:
146
167
  """Thread-safe method to update report fields"""
147
168
  with self._report_lock:
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
160
181
  )
161
182
  self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
162
183
 
163
- def delete_entity(self, urn: str) -> None:
184
+ def delete_entity(self, urn: Urn) -> None:
164
185
  assert self.ctx.graph
165
186
 
166
- entity_urn = Urn.from_string(urn)
167
187
  if self.dry_run:
168
188
  logger.info(
169
189
  f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
172
192
  if self._deletion_limit_reached() or self._times_up():
173
193
  return
174
194
  self._increment_removal_started_count()
175
- self.ctx.graph.delete_entity(urn=urn, hard=True)
195
+ self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
176
196
  self.ctx.graph.delete_references_to_urn(
177
- urn=urn,
197
+ urn=urn.urn(),
178
198
  dry_run=False,
179
199
  )
180
- self._update_report(urn, entity_urn.entity_type)
200
+ self._update_report(urn.urn(), urn.entity_type)
181
201
 
182
- def delete_soft_deleted_entity(self, urn: str) -> None:
202
+ def delete_soft_deleted_entity(self, urn: Urn) -> None:
183
203
  assert self.ctx.graph
184
204
 
185
205
  retention_time = (
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
187
207
  - self.config.retention_days * 24 * 60 * 60
188
208
  )
189
209
 
190
- aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
210
+ aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
191
211
  if "status" in aspect["aspects"]:
192
212
  if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
193
213
  "status"
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
196
216
  self.delete_entity(urn)
197
217
  else:
198
218
  self._increment_retained_count()
219
+ self._increment_retained_by_type(urn.entity_type)
199
220
 
200
221
  def _print_report(self) -> None:
201
222
  time_taken = round(time.time() - self.last_print_time, 1)
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
204
225
  self.last_print_time = time.time()
205
226
  logger.info(f"\n{self.report.as_string()}")
206
227
 
207
- def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
228
+ def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
208
229
  done, not_done = wait(futures, return_when=FIRST_COMPLETED)
209
230
  futures = {future: urn for future, urn in futures.items() if future in not_done}
210
231
 
@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
214
235
  self.report.failure(
215
236
  title="Failed to delete entity",
216
237
  message="Failed to delete entity",
217
- context=futures[future],
238
+ context=futures[future].urn(),
218
239
  exc=future.exception(),
219
240
  )
220
241
  self.report.num_soft_deleted_entity_processed += 1
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
229
250
  time.sleep(self.config.delay)
230
251
  return futures
231
252
 
232
- def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
253
+ def _get_urns(self) -> Iterable[str]:
233
254
  assert self.ctx.graph
234
- scroll_id: Optional[str] = None
235
-
236
- batch_size = self.config.batch_size
237
- if entity_type == "DATA_PROCESS_INSTANCE":
238
- # Due to a bug in Data process instance querying this is a temp workaround
239
- # to avoid a giant stacktrace by having a smaller batch size in first call
240
- # This will be remove in future version after server with fix has been
241
- # around for a while
242
- batch_size = 10
243
-
244
- while True:
245
- try:
246
- if entity_type not in self.report.num_calls_made:
247
- self.report.num_calls_made[entity_type] = 1
248
- else:
249
- self.report.num_calls_made[entity_type] += 1
250
- self._print_report()
251
- result = self.ctx.graph.execute_graphql(
252
- graphql_query,
253
- {
254
- "input": {
255
- "types": [entity_type],
256
- "query": "*",
257
- "scrollId": scroll_id if scroll_id else None,
258
- "count": batch_size,
259
- "orFilters": [
260
- {
261
- "and": [
262
- {
263
- "field": "removed",
264
- "values": ["true"],
265
- "condition": "EQUAL",
266
- }
267
- ]
268
- }
269
- ],
270
- }
271
- },
272
- )
273
- except Exception as e:
274
- self.report.failure(
275
- f"While trying to get {entity_type} with {scroll_id}", exc=e
276
- )
277
- break
278
- scroll_across_entities = result.get("scrollAcrossEntities")
279
- if not scroll_across_entities:
280
- break
281
- search_results = scroll_across_entities.get("searchResults")
282
- count = scroll_across_entities.get("count")
283
- if not count or not search_results:
284
- # Due to a server bug we cannot rely on just count as it was returning response like this
285
- # {'count': 1, 'nextScrollId': None, 'searchResults': []}
286
- break
287
- if entity_type == "DATA_PROCESS_INSTANCE":
288
- # Temp workaround. See note in beginning of the function
289
- # We make the batch size = config after call has succeeded once
290
- batch_size = self.config.batch_size
291
- scroll_id = scroll_across_entities.get("nextScrollId")
292
- if entity_type not in self.report.num_entities_found:
293
- self.report.num_entities_found[entity_type] = 0
294
- self.report.num_entities_found[entity_type] += scroll_across_entities.get(
295
- "count"
255
+ # Entities created in the retention period are not considered for deletion
256
+ created_from = int(
257
+ (
258
+ datetime.now(timezone.utc).timestamp()
259
+ - self.config.retention_days * 24 * 60 * 60
296
260
  )
297
- for query in search_results:
298
- yield query["entity"]["urn"]
261
+ * 1000
262
+ )
263
+
264
+ entity_types = self.config.entity_types
265
+ # dataProcessInstance is a special case where we need to get the entities separately
266
+ # because we need to filter on created time so that we don't stream too many dataProcessInstance entities at once
267
+ # The GC source soft-deletes dataProcessInstance entities, which results in a large number of soft-deleted entities
268
+ if (
269
+ self.config.entity_types
270
+ and "dataProcessInstance" in self.config.entity_types
271
+ ):
272
+ entity_types = self.config.entity_types.copy()
273
+ yield from self.ctx.graph.get_urns_by_filter(
274
+ entity_types=["dataProcessInstance"],
275
+ platform=self.config.platform,
276
+ env=self.config.env,
277
+ query=self.config.query,
278
+ status=RemovedStatusFilter.ONLY_SOFT_DELETED,
279
+ batch_size=self.config.batch_size,
280
+ extraFilters=[
281
+ SearchFilterRule(
282
+ field="created",
283
+ condition="LESS_THAN",
284
+ values=[f"{created_from}"],
285
+ ).to_raw()
286
+ ],
287
+ )
288
+
289
+ entity_types.remove("dataProcessInstance")
299
290
 
300
- def _get_urns(self) -> Iterable[str]:
301
- assert self.ctx.graph
302
291
  yield from self.ctx.graph.get_urns_by_filter(
303
- entity_types=self.config.entity_types,
292
+ entity_types=entity_types,
304
293
  platform=self.config.platform,
305
294
  env=self.config.env,
306
295
  query=self.config.query,
307
296
  status=RemovedStatusFilter.ONLY_SOFT_DELETED,
308
297
  batch_size=self.config.batch_size,
309
298
  )
310
- yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
311
- yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
312
299
 
313
300
  def _times_up(self) -> bool:
314
301
  if (
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
335
322
  return
336
323
  self.start_time = time.time()
337
324
 
338
- futures: Dict[Future, str] = dict()
325
+ futures: Dict[Future, Urn] = dict()
339
326
  with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
340
327
  for urn in self._get_urns():
328
+ try:
329
+ self.report.num_soft_deleted_entity_found += 1
330
+ soft_deleted_urn = Urn.from_string(urn)
331
+ except InvalidUrnError as e:
332
+ logger.error(f"Failed to parse urn {urn} with error {e}")
333
+ self.report.num_soft_deleted_entity_invalid_urn += 1
334
+ continue
335
+
341
336
  self._print_report()
342
337
  while len(futures) >= self.config.futures_max_at_time:
343
338
  futures = self._process_futures(futures)
344
339
  if self._deletion_limit_reached() or self._times_up():
345
340
  break
346
- future = executor.submit(self.delete_soft_deleted_entity, urn)
347
- futures[future] = urn
341
+ future = executor.submit(
342
+ self.delete_soft_deleted_entity, soft_deleted_urn
343
+ )
344
+ futures[future] = soft_deleted_urn
348
345
 
349
346
  logger.info(f"Waiting for {len(futures)} futures to complete")
350
347
  while len(futures) > 0:
@@ -1,6 +1,5 @@
1
1
  import logging
2
2
  from typing import Dict, Iterable, List, Optional
3
- from urllib.parse import unquote
4
3
 
5
4
  from pydantic import Field, SecretStr, validator
6
5
 
@@ -17,8 +16,12 @@ from datahub.ingestion.api.decorators import (
17
16
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability
18
17
  from datahub.ingestion.api.workunit import MetadataWorkUnit
19
18
  from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
19
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
20
20
  from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
21
21
  from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
22
+ from datahub.ingestion.source.data_lake_common.object_store import (
23
+ create_object_store_adapter,
24
+ )
22
25
  from datahub.ingestion.source.data_lake_common.path_spec import PathSpec, is_gcs_uri
23
26
  from datahub.ingestion.source.s3.config import DataLakeSourceConfig
24
27
  from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -34,6 +37,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
34
37
 
35
38
  logger: logging.Logger = logging.getLogger(__name__)
36
39
 
40
+ GCS_ENDPOINT_URL = "https://storage.googleapis.com"
41
+
37
42
 
38
43
  class HMACKey(ConfigModel):
39
44
  hmac_access_id: str = Field(description="Access ID")
@@ -80,7 +85,14 @@ class GCSSourceReport(DataLakeSourceReport):
80
85
  @platform_name("Google Cloud Storage", id=PLATFORM_GCS)
81
86
  @config_class(GCSSourceConfig)
82
87
  @support_status(SupportStatus.INCUBATING)
83
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
88
+ @capability(
89
+ SourceCapability.CONTAINERS,
90
+ "Enabled by default",
91
+ subtype_modifier=[
92
+ SourceCapabilityModifier.GCS_BUCKET,
93
+ SourceCapabilityModifier.FOLDER,
94
+ ],
95
+ )
84
96
  @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
85
97
  @capability(SourceCapability.DATA_PROFILING, "Not supported", supported=False)
86
98
  class GCSSource(StatefulIngestionSourceBase):
@@ -102,7 +114,7 @@ class GCSSource(StatefulIngestionSourceBase):
102
114
  s3_config = DataLakeSourceConfig(
103
115
  path_specs=s3_path_specs,
104
116
  aws_config=AwsConnectionConfig(
105
- aws_endpoint_url="https://storage.googleapis.com",
117
+ aws_endpoint_url=GCS_ENDPOINT_URL,
106
118
  aws_access_key_id=self.config.credential.hmac_access_id,
107
119
  aws_secret_access_key=self.config.credential.hmac_access_secret.get_secret_value(),
108
120
  aws_region="auto",
@@ -110,15 +122,26 @@ class GCSSource(StatefulIngestionSourceBase):
110
122
  env=self.config.env,
111
123
  max_rows=self.config.max_rows,
112
124
  number_of_files_to_sample=self.config.number_of_files_to_sample,
125
+ platform=PLATFORM_GCS, # Ensure GCS platform is used for correct container subtypes
126
+ platform_instance=self.config.platform_instance,
113
127
  )
114
128
  return s3_config
115
129
 
116
130
  def create_equivalent_s3_path_specs(self):
117
131
  s3_path_specs = []
118
132
  for path_spec in self.config.path_specs:
133
+ # PathSpec modifies the passed-in include to add /** to the end if
134
+ # autodetecting partitions. Remove that, otherwise creating a new
135
+ # PathSpec will complain.
136
+ # TODO: this should be handled inside PathSpec, which probably shouldn't
137
+ # modify its input.
138
+ include = path_spec.include
139
+ if include.endswith("{table}/**") and not path_spec.allow_double_stars:
140
+ include = include.removesuffix("**")
141
+
119
142
  s3_path_specs.append(
120
143
  PathSpec(
121
- include=path_spec.include.replace("gs://", "s3://"),
144
+ include=include.replace("gs://", "s3://"),
122
145
  exclude=(
123
146
  [exc.replace("gs://", "s3://") for exc in path_spec.exclude]
124
147
  if path_spec.exclude
@@ -129,6 +152,11 @@ class GCSSource(StatefulIngestionSourceBase):
129
152
  table_name=path_spec.table_name,
130
153
  enable_compression=path_spec.enable_compression,
131
154
  sample_files=path_spec.sample_files,
155
+ allow_double_stars=path_spec.allow_double_stars,
156
+ autodetect_partitions=path_spec.autodetect_partitions,
157
+ include_hidden_folders=path_spec.include_hidden_folders,
158
+ tables_filter_pattern=path_spec.tables_filter_pattern,
159
+ traversal_method=path_spec.traversal_method,
132
160
  )
133
161
  )
134
162
 
@@ -136,16 +164,31 @@ class GCSSource(StatefulIngestionSourceBase):
136
164
 
137
165
  def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
138
166
  config = self.create_equivalent_s3_config()
139
- return self.s3_source_overrides(S3Source(config, PipelineContext(ctx.run_id)))
167
+ # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
168
+ s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
169
+ s3_source = S3Source(config, s3_ctx)
170
+ return self.s3_source_overrides(s3_source)
140
171
 
141
172
  def s3_source_overrides(self, source: S3Source) -> S3Source:
142
- source.source_config.platform = PLATFORM_GCS
173
+ """
174
+ Override S3Source methods with GCS-specific implementations using the adapter pattern.
143
175
 
144
- source.is_s3_platform = lambda: True # type: ignore
145
- source.create_s3_path = lambda bucket_name, key: unquote( # type: ignore
146
- f"s3://{bucket_name}/{key}"
176
+ This method customizes the S3Source instance to behave like a GCS source by
177
+ applying the GCS-specific adapter that replaces the necessary functionality.
178
+
179
+ Args:
180
+ source: The S3Source instance to customize
181
+
182
+ Returns:
183
+ The modified S3Source instance with GCS behavior
184
+ """
185
+ # Create a GCS adapter with project ID and region from our config
186
+ adapter = create_object_store_adapter(
187
+ "gcs",
147
188
  )
148
- return source
189
+
190
+ # Apply all customizations to the source
191
+ return adapter.apply_customizations(source)
149
192
 
150
193
  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
151
194
  return [
@@ -4,35 +4,62 @@ GCS_PREFIX = "gs://"
4
4
 
5
5
 
6
6
  def is_gcs_uri(uri: str) -> bool:
7
+ """
8
+ Check if a URI is a GCS URI (starts with gs://).
9
+
10
+ For more general URI handling, consider using object_store.get_object_store_for_uri.
11
+ """
7
12
  return uri.startswith(GCS_PREFIX)
8
13
 
9
14
 
10
15
  def get_gcs_prefix(gcs_uri: str) -> Optional[str]:
16
+ """
17
+ Get the GCS prefix (gs://) if the URI is a GCS URI.
18
+
19
+ For more general URI handling, consider using object_store.get_object_store_for_uri.
20
+ """
11
21
  if gcs_uri.startswith(GCS_PREFIX):
12
22
  return GCS_PREFIX
13
23
  return None
14
24
 
15
25
 
16
26
  def strip_gcs_prefix(gcs_uri: str) -> str:
17
- # remove GCS prefix (gs://)
27
+ """
28
+ Remove the GCS prefix (gs://) from a GCS URI.
29
+
30
+ For more general URI handling, consider using the object_store module.
31
+
32
+ Args:
33
+ gcs_uri: A GCS URI starting with gs://
34
+
35
+ Returns:
36
+ The URI without the gs:// prefix
37
+
38
+ Raises:
39
+ ValueError: If the URI doesn't start with gs://
40
+ """
18
41
  prefix = get_gcs_prefix(gcs_uri)
19
42
  if not prefix:
20
- raise ValueError(f"Not an GCS URI. Must start with prefix: {GCS_PREFIX}")
43
+ raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
21
44
 
22
45
  return gcs_uri[len(GCS_PREFIX) :]
23
46
 
24
47
 
25
- def get_gcs_bucket_name(path):
26
- if not is_gcs_uri(path):
27
- raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
28
- return strip_gcs_prefix(path).split("/")[0]
29
-
30
-
31
48
  def get_gcs_bucket_relative_path(gcs_uri: str) -> str:
49
+ """
50
+ Get the path relative to the bucket from a GCS URI.
51
+
52
+ For more general URI handling, consider using object_store.get_object_key.
53
+ """
32
54
  return "/".join(strip_gcs_prefix(gcs_uri).split("/")[1:])
33
55
 
34
56
 
35
57
  def get_gcs_key_prefix(gcs_uri: str) -> str:
58
+ """
59
+ Get the key prefix (everything after the bucket name) from a GCS URI.
60
+
61
+ For more general URI handling, consider using object_store.get_object_key.
62
+ """
36
63
  if not is_gcs_uri(gcs_uri):
37
- raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
64
+ raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
38
65
  return strip_gcs_prefix(gcs_uri).split("/", maxsplit=1)[1]