acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,732 @@
1
+ import re
2
+ from abc import ABC, abstractmethod
3
+
4
+ # Add imports for source customization
5
+ from typing import Any, Callable, Dict, Optional, Type, TypeVar
6
+ from urllib.parse import unquote
7
+
8
+ # Don't import TableData at the module level to avoid circular imports
9
+ # from datahub.ingestion.source.s3.source import TableData
10
+
11
+ T = TypeVar("T")
12
+
13
+
14
class ObjectStoreInterface(ABC):
    """
    Contract shared by all object store URI helpers.

    Each connector (S3, GCS, ABS, etc.) subclasses this interface and fills in
    the classmethods below, giving callers one consistent API for URI parsing.
    """

    @classmethod
    @abstractmethod
    def is_uri(cls, uri: str) -> bool:
        """
        Report whether the URI belongs to this object store.

        Args:
            uri: The URI to inspect

        Returns:
            True when the URI is for this object store, otherwise False
        """

    @classmethod
    @abstractmethod
    def get_prefix(cls, uri: str) -> Optional[str]:
        """
        Return the object store prefix of the URI (e.g., 's3://', 'gs://').

        Args:
            uri: The URI whose prefix is wanted

        Returns:
            The prefix when the URI starts with it, otherwise None
        """

    @classmethod
    @abstractmethod
    def strip_prefix(cls, uri: str) -> str:
        """
        Drop the object store prefix from the URI.

        Args:
            uri: The URI to strip

        Returns:
            The remainder of the URI after the prefix

        Raises:
            ValueError: If the URI does not start with the expected prefix
        """

    @classmethod
    @abstractmethod
    def get_bucket_name(cls, uri: str) -> str:
        """
        Extract the bucket name from the URI.

        Args:
            uri: The URI to read the bucket name from

        Returns:
            The bucket name

        Raises:
            ValueError: If the URI is not valid for this object store
        """

    @classmethod
    @abstractmethod
    def get_object_key(cls, uri: str) -> str:
        """
        Extract the object key/path (everything after the bucket) from the URI.

        Args:
            uri: The URI to read the object key from

        Returns:
            The object key

        Raises:
            ValueError: If the URI is not valid for this object store
        """

    @classmethod
    def get_object_store_bucket_name(cls, uri: str) -> str:
        """
        Extract the bucket name, optionally accepting foreign URIs.

        This base implementation simply delegates to get_bucket_name;
        subclasses may override it to understand URIs of other object stores.

        Args:
            uri: The URI to read the bucket name from

        Returns:
            The bucket name

        Raises:
            ValueError: If the URI is not supported
        """
        bucket_name = cls.get_bucket_name(uri)
        return bucket_name
119
+
120
+
121
class S3ObjectStore(ObjectStoreInterface):
    """Amazon S3 flavour of ObjectStoreInterface (s3://, s3n:// and s3a:// URIs)."""

    PREFIXES = ["s3://", "s3n://", "s3a://"]

    @classmethod
    def _not_s3_uri_error(cls) -> ValueError:
        """Build the error raised whenever a URI carries no known S3 prefix."""
        return ValueError(
            f"Not an S3 URI. Must start with one of the following prefixes: {cls.PREFIXES!s}"
        )

    @classmethod
    def is_uri(cls, uri: str) -> bool:
        """Return True when the URI starts with any of the S3 prefixes."""
        return uri.startswith(tuple(cls.PREFIXES))

    @classmethod
    def get_prefix(cls, uri: str) -> Optional[str]:
        """Return the first S3 prefix the URI starts with, or None."""
        return next(
            (candidate for candidate in cls.PREFIXES if uri.startswith(candidate)),
            None,
        )

    @classmethod
    def strip_prefix(cls, uri: str) -> str:
        """Return the URI without its S3 prefix; raise ValueError if it has none."""
        matched = cls.get_prefix(uri)
        if matched:
            return uri[len(matched) :]
        raise cls._not_s3_uri_error()

    @classmethod
    def get_bucket_name(cls, uri: str) -> str:
        """Return the bucket, i.e. the text between the prefix and the first '/'."""
        if cls.is_uri(uri):
            bucket, _, _ = cls.strip_prefix(uri).partition("/")
            return bucket
        raise cls._not_s3_uri_error()

    @classmethod
    def get_object_key(cls, uri: str) -> str:
        """Return everything after the bucket's '/', or '' when there is no key."""
        if cls.is_uri(uri):
            _, _, key = cls.strip_prefix(uri).partition("/")
            return key
        raise cls._not_s3_uri_error()

    @classmethod
    def get_object_store_bucket_name(cls, uri: str) -> str:
        """
        Extract the bucket name from an S3 URI.

        Args:
            uri: The URI to read the bucket name from

        Returns:
            The bucket name

        Raises:
            ValueError: If the URI is not an S3 URI
        """
        bucket_name = cls.get_bucket_name(uri)
        return bucket_name
180
+
181
+
182
class GCSObjectStore(ObjectStoreInterface):
    """Google Cloud Storage flavour of ObjectStoreInterface (gs:// URIs)."""

    PREFIX = "gs://"

    @classmethod
    def _not_gcs_uri_error(cls) -> ValueError:
        """Build the error raised whenever a URI lacks the GCS prefix."""
        return ValueError(f"Not a GCS URI. Must start with prefix: {cls.PREFIX}")

    @classmethod
    def is_uri(cls, uri: str) -> bool:
        """Return True when the URI starts with the GCS prefix."""
        return uri.startswith(cls.PREFIX)

    @classmethod
    def get_prefix(cls, uri: str) -> Optional[str]:
        """Return the GCS prefix when the URI starts with it, otherwise None."""
        return cls.PREFIX if uri.startswith(cls.PREFIX) else None

    @classmethod
    def strip_prefix(cls, uri: str) -> str:
        """Return the URI without the GCS prefix; raise ValueError if absent."""
        matched = cls.get_prefix(uri)
        if matched:
            return uri[len(matched) :]
        raise cls._not_gcs_uri_error()

    @classmethod
    def get_bucket_name(cls, uri: str) -> str:
        """Return the bucket, i.e. the text between the prefix and the first '/'."""
        if cls.is_uri(uri):
            bucket, _, _ = cls.strip_prefix(uri).partition("/")
            return bucket
        raise cls._not_gcs_uri_error()

    @classmethod
    def get_object_key(cls, uri: str) -> str:
        """Return everything after the bucket's '/', or '' when there is no key."""
        if cls.is_uri(uri):
            _, _, key = cls.strip_prefix(uri).partition("/")
            return key
        raise cls._not_gcs_uri_error()

    @classmethod
    def get_object_store_bucket_name(cls, uri: str) -> str:
        """
        Extract the bucket name from a GCS URI.

        Args:
            uri: The URI to read the bucket name from

        Returns:
            The bucket name

        Raises:
            ValueError: If the URI is not a GCS URI
        """
        bucket_name = cls.get_bucket_name(uri)
        return bucket_name
234
+
235
+
236
class ABSObjectStore(ObjectStoreInterface):
    """
    Azure Blob Storage flavour of ObjectStoreInterface.

    Two URI shapes are recognised:
    abfss://container@account.dfs.core.windows.net/path and
    https://account.blob.core.windows.net/container/path
    """

    PREFIX = "abfss://"
    HTTPS_REGEX = re.compile(r"(https?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)")

    @classmethod
    def _not_abs_uri_error(cls) -> ValueError:
        """Build the error raised whenever a URI matches neither ABS shape."""
        return ValueError(
            f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
        )

    @classmethod
    def is_uri(cls, uri: str) -> bool:
        """Return True for abfss:// URIs and blob.core.windows.net HTTP(S) URLs."""
        if uri.startswith(cls.PREFIX):
            return True
        return cls.HTTPS_REGEX.match(uri) is not None

    @classmethod
    def get_prefix(cls, uri: str) -> Optional[str]:
        """Return 'abfss://' or the matched 'https://account.blob.../' part, else None."""
        if uri.startswith(cls.PREFIX):
            return cls.PREFIX

        # HTTPS format: the prefix is the scheme plus the account host
        https_match = cls.HTTPS_REGEX.match(uri)
        return https_match.group(1) if https_match else None

    @classmethod
    def strip_prefix(cls, uri: str) -> str:
        """Return the URI without its ABS prefix; raise ValueError if it has none."""
        leading = cls.get_prefix(uri)
        if leading is None:
            raise cls._not_abs_uri_error()
        return uri[len(leading) :]

    @classmethod
    def get_bucket_name(cls, uri: str) -> str:
        """Return the container name from either supported URI shape."""
        if not cls.is_uri(uri):
            raise cls._not_abs_uri_error()

        # abfss://container@account.dfs.core.windows.net/path -> ends at "@"
        # https://account.blob.core.windows.net/container/path -> ends at "/"
        delimiter = "@" if uri.startswith(cls.PREFIX) else "/"
        container, _, _ = cls.strip_prefix(uri).partition(delimiter)
        return container

    @classmethod
    def get_object_key(cls, uri: str) -> str:
        """Return the path after the container, or '' when the URI has none."""
        if not cls.is_uri(uri):
            raise cls._not_abs_uri_error()

        remainder = cls.strip_prefix(uri)
        if uri.startswith(cls.PREFIX):
            # abfss://container@account.dfs.core.windows.net/path
            # Skip "container@"; without an "@" nothing is left and the key is "".
            remainder = remainder.partition("@")[2]
        # In both shapes the key is whatever follows the next "/".
        return remainder.partition("/")[2]
310
+
311
+
312
# Lookup table from platform key to its object store implementation.
# Entries are kept in this insertion order: s3, gcs, abs.
OBJECT_STORE_REGISTRY: Dict[str, Type[ObjectStoreInterface]] = dict(
    s3=S3ObjectStore,
    gcs=GCSObjectStore,
    abs=ABSObjectStore,
)
318
+
319
+
320
def get_object_store_for_uri(uri: str) -> Optional[Type[ObjectStoreInterface]]:
    """
    Find the object store implementation that recognises the given URI.

    Registered implementations are tried in registry order and the first one
    whose is_uri() accepts the URI wins.

    Args:
        uri: The URI to find an object store for

    Returns:
        The matching object store implementation, or None when none matches
    """
    return next(
        (
            candidate
            for candidate in OBJECT_STORE_REGISTRY.values()
            if candidate.is_uri(uri)
        ),
        None,
    )
334
+
335
+
336
def get_object_store_bucket_name(uri: str) -> str:
    """
    Extract the bucket name from any supported object store URI.

    Acts as a central dispatcher:
    1. Look up the registered object store implementation for the URI
    2. If one is found, let it extract the bucket name
    3. Otherwise fall back to parsing the known URI formats directly

    Args:
        uri: The URI to read the bucket name from

    Returns:
        The bucket name

    Raises:
        ValueError: If the URI is not supported by any registered object store
    """
    # Preferred path: a registered implementation recognises the URI
    native_store = get_object_store_for_uri(uri)
    if native_store:
        return native_store.get_bucket_name(uri)

    # Fallback path: parse the known URI shapes by hand
    gcs_prefix = "gs://"
    if uri.startswith(gcs_prefix):
        return uri[len(gcs_prefix) :].partition("/")[0]

    for s3_prefix in S3ObjectStore.PREFIXES:
        if uri.startswith(s3_prefix):
            return uri[len(s3_prefix) :].partition("/")[0]

    if uri.startswith(ABSObjectStore.PREFIX):
        return uri[len(ABSObjectStore.PREFIX) :].partition("@")[0]

    # HTTPS Azure Blob Storage URLs: the container follows the account host
    https_match = ABSObjectStore.HTTPS_REGEX.match(uri)
    if https_match:
        after_host = uri[len(https_match.group(1)) :]
        return after_host.partition("/")[0]

    raise ValueError(f"Unsupported URI format: {uri}")
377
+
378
+
379
def get_object_key(uri: str) -> str:
    """
    Extract the object key from any supported object store URI.

    Args:
        uri: The URI to read the object key from

    Returns:
        The object key

    Raises:
        ValueError: If the URI is not supported by any registered object store
    """
    matching_store = get_object_store_for_uri(uri)
    if not matching_store:
        raise ValueError(f"Unsupported URI format: {uri}")
    return matching_store.get_object_key(uri)
397
+
398
+
399
class ObjectStoreSourceAdapter:
    """
    Adapter for customizing object store source implementations.

    An adapter collects named callables ("customizations") for one platform
    (S3, GCS, ABS) and installs them as attributes on a source instance via
    apply_customizations, so the source classes themselves stay untouched.
    """

    @staticmethod
    def create_s3_path(bucket_name: str, key: str) -> str:
        """
        Create a default S3 path.

        Args:
            bucket_name: The bucket name
            key: The object key

        Returns:
            The s3:// URI for the object, with percent-escapes decoded
        """
        return unquote(f"s3://{bucket_name}/{key}")

    @staticmethod
    def create_gcs_path(bucket_name: str, key: str) -> str:
        """
        Create a default GCS path.

        Args:
            bucket_name: The bucket name
            key: The object key

        Returns:
            The gs:// URI for the object, with percent-escapes decoded
        """
        return unquote(f"gs://{bucket_name}/{key}")

    @staticmethod
    def create_abs_path(container_name: str, key: str, account_name: str) -> str:
        """
        Create a default Azure Blob Storage path.

        Args:
            container_name: The container name
            key: The object key
            account_name: The storage account name

        Returns:
            The abfss:// URI for the object, with percent-escapes decoded
        """
        return unquote(
            f"abfss://{container_name}@{account_name}.dfs.core.windows.net/{key}"
        )

    @staticmethod
    def get_s3_external_url(
        table_data: Any, region: Optional[str] = None
    ) -> Optional[str]:
        """
        Get the AWS S3 console URL for the given table.

        Args:
            table_data: Object exposing the table location as ``table_path``
            region: AWS region for the S3 console URL, defaults to us-east-1 if not specified

        Returns:
            The AWS console URL, or None if the path is not an S3 URI
        """
        if not S3ObjectStore.is_uri(table_data.table_path):
            return None

        # Split the S3 URI into bucket and key
        bucket_name = get_object_store_bucket_name(table_data.table_path)
        key = get_object_key(table_data.table_path)

        # An empty or missing region falls back to us-east-1
        aws_region = region or "us-east-1"

        return f"https://{aws_region}.console.aws.amazon.com/s3/buckets/{bucket_name}?prefix={key}"

    @staticmethod
    def get_gcs_external_url(table_data: Any) -> Optional[str]:
        """
        Get the GCS console URL for the given table.

        Args:
            table_data: Object exposing the table location as ``table_path``

        Returns:
            The GCS console URL, or None if the path is not a GCS URI
        """
        if not GCSObjectStore.is_uri(table_data.table_path):
            return None

        # Split the GCS URI into bucket and key
        bucket_name = get_object_store_bucket_name(table_data.table_path)
        key = get_object_key(table_data.table_path)

        return f"https://console.cloud.google.com/storage/browser/{bucket_name}/{key}"

    @staticmethod
    def get_abs_external_url(table_data: Any) -> Optional[str]:
        """
        Get the Azure Storage browser URL for the given table.

        Args:
            table_data: Object exposing the table location as ``table_path``

        Returns:
            The Azure portal URL, or None if the path is not an ABS URI or
            cannot be parsed
        """
        if not ABSObjectStore.is_uri(table_data.table_path):
            return None

        try:
            if table_data.table_path.startswith("abfss://"):
                # URI format: abfss://container@account.dfs.core.windows.net/path
                path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
                parts = path_without_prefix.split("@", 1)
                if len(parts) < 2:
                    return None

                container_name = parts[0]
                account_parts = parts[1].split("/", 1)
                account_domain = account_parts[0]
                # The account name is the first label of the host name
                account_name = account_domain.split(".")[0]
            else:
                # Handle HTTPS format: https://account.blob.core.windows.net/container/path
                container_name = ABSObjectStore.get_bucket_name(table_data.table_path)
                if "blob.core.windows.net" in table_data.table_path:
                    account_name = table_data.table_path.split("//")[1].split(".")[0]
                else:
                    return None

            # Construct Azure portal URL
            return f"https://portal.azure.com/#blade/Microsoft_Azure_Storage/ContainerMenuBlade/overview/storageAccountId/{account_name}/containerName/{container_name}"
        except Exception:
            # Deliberately best-effort: any parsing error means "no URL"
            return None

    def __init__(
        self,
        platform: str,
        platform_name: str,
        aws_region: Optional[str] = None,
        azure_storage_account: Optional[str] = None,
    ):
        """
        Initialize the adapter with platform-specific configurations.

        Default customizations are registered for the "gcs", "s3" and "abs"
        platforms; any other platform starts with no customizations.

        Args:
            platform: The platform identifier (e.g., "s3", "gcs", "abs")
            platform_name: The human-readable platform name
            aws_region: AWS region for S3 URLs, defaults to us-east-1 if not specified
            azure_storage_account: Azure Storage account name
        """
        self.platform = platform
        self.platform_name = platform_name
        self.aws_region = aws_region
        self.azure_storage_account = azure_storage_account
        self.customizations: Dict[str, Callable[..., Any]] = {}

        # Register default customizations based on platform
        if platform == "gcs":
            self.register_customization("is_s3_platform", lambda: True)
            self.register_customization("create_s3_path", self.create_gcs_path)
            self.register_customization(
                "get_external_url",
                lambda table_data: self.get_gcs_external_url(table_data),
            )
            # Fix URI mismatch issue in pattern matching
            self.register_customization(
                "_normalize_uri_for_pattern_matching",
                self._normalize_gcs_uri_for_pattern_matching,
            )
            # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
            self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
        elif platform == "s3":
            self.register_customization("is_s3_platform", lambda: True)
            self.register_customization("create_s3_path", self.create_s3_path)
            self.register_customization(
                "get_external_url",
                lambda table_data: self.get_s3_external_url(
                    table_data, self.aws_region
                ),
            )
        elif platform == "abs":
            self.register_customization("is_s3_platform", lambda: True)
            # If we have an Azure storage account, create a specialized path creation function
            if self.azure_storage_account:
                # Local non-optional copy captured by the lambda below
                storage_account = self.azure_storage_account
                self.register_customization(
                    "create_s3_path",
                    lambda bucket, key: self.create_abs_path(
                        bucket, key, storage_account
                    ),
                )
            else:
                # Fall back to a simpler implementation if no account provided
                # NOTE(review): this puts the key right after "@", where the
                # account host normally goes - confirm this is intended.
                self.register_customization(
                    "create_s3_path", lambda bucket, key: f"abfss://{bucket}@{key}"
                )
            self.register_customization("get_external_url", self.get_abs_external_url)

    def register_customization(
        self, method_name: str, implementation: Callable[..., Any]
    ) -> None:
        """
        Register a customization for a specific method.

        A later registration under the same name replaces the earlier one.

        Args:
            method_name: The name of the method to customize
            implementation: The implementation to use
        """
        self.customizations[method_name] = implementation

    def apply_customizations(self, source: Any) -> Any:
        """
        Apply all registered customizations to the source instance.

        Each customization is set as an instance attribute on ``source`` under
        its registered name. If ``source.source_config`` has a ``platform``
        attribute, it is overwritten with this adapter's platform.

        Args:
            source: The source instance to customize

        Returns:
            The same source instance, after customization
        """
        # Set the platform
        if hasattr(source, "source_config") and hasattr(
            source.source_config, "platform"
        ):
            source.source_config.platform = self.platform

        # Install every callable exactly as registered. Bound methods (such as
        # this adapter's own GCS helpers) stay bound to their original owner:
        # calling __get__ on an already-bound method does not rebind it, and
        # bound builtin methods have no __get__ at all, so attempting a rebind
        # would raise AttributeError for them.
        for method_name, implementation in self.customizations.items():
            setattr(source, method_name, implementation)

        return source

    # Add a direct method for tests that may call this directly
    def get_external_url(self, table_data: Any) -> Optional[str]:
        """
        Get the external URL for a table based on the platform type.

        This method routes to the appropriate implementation based on the platform.

        Args:
            table_data: Object exposing the table location as ``table_path``

        Returns:
            An external URL, or None if not applicable or the platform is unknown
        """
        if self.platform == "s3":
            return self.get_s3_external_url(table_data, self.aws_region)
        elif self.platform == "gcs":
            return self.get_gcs_external_url(table_data)
        elif self.platform == "abs":
            return self.get_abs_external_url(table_data)
        return None

    def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
        """
        Normalize GCS URI for pattern matching.

        This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
        fixing the URI mismatch issue in GCS ingestion.

        Args:
            uri: The URI to normalize

        Returns:
            The URI with a leading gs:// replaced by s3://; other URIs unchanged
        """
        if uri.startswith("gs://"):
            # Only the leading scheme is replaced (count=1)
            return uri.replace("gs://", "s3://", 1)
        return uri

    def _strip_gcs_prefix(self, uri: str) -> str:
        """
        Strip GCS prefix from URI.

        This method removes the gs:// prefix from GCS URIs for path processing.

        Args:
            uri: The URI to strip the prefix from

        Returns:
            The URI without the gs:// prefix; other URIs unchanged
        """
        if uri.startswith("gs://"):
            return uri[5:]  # Remove "gs://" prefix
        return uri
702
+
703
+
704
# Factory: build an ObjectStoreSourceAdapter from just a platform identifier
def create_object_store_adapter(
    platform: str,
    aws_region: Optional[str] = None,
    azure_storage_account: Optional[str] = None,
) -> ObjectStoreSourceAdapter:
    """
    Build an adapter for the given object store platform.

    The human-readable platform name is looked up from the platform
    identifier; unrecognised identifiers are labelled "Unknown (<platform>)".

    Args:
        platform: The platform identifier (e.g., "s3", "gcs", "abs")
        aws_region: AWS region for S3 URLs, defaults to us-east-1 if not specified
        azure_storage_account: Azure Storage account name

    Returns:
        An adapter configured for the specified platform
    """
    display_names = {
        "s3": "Amazon S3",
        "gcs": "Google Cloud Storage",
        "abs": "Azure Blob Storage",
    }
    if platform in display_names:
        display_name = display_names[platform]
    else:
        display_name = f"Unknown ({platform})"

    return ObjectStoreSourceAdapter(
        platform=platform,
        platform_name=display_name,
        aws_region=aws_region,
        azure_storage_account=azure_storage_account,
    )