acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/s3/source.py
@@ -3,15 +3,14 @@ import functools
 import logging
 import os
 import pathlib
+import posixpath
 import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple
-from urllib.parse import urlparse
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import smart_open.compression as so_compression
-from more_itertools import peekable
 from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession
 from pyspark.sql.dataframe import DataFrame
@@ -35,14 +34,25 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags, list_folders
+from datahub.ingestion.source.aws.s3_boto_utils import (
+    get_s3_tags,
+    list_folders_path,
+    list_objects_recursive_path,
+)
 from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
     strip_s3_prefix,
 )
-from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
+from datahub.ingestion.source.data_lake_common.data_lake_utils import (
+    ContainerWUCreator,
+    add_partition_columns_to_schema,
+)
+from datahub.ingestion.source.data_lake_common.object_store import (
+    create_object_store_adapter,
+)
 from datahub.ingestion.source.data_lake_common.path_spec import FolderTraversalMethod
 from datahub.ingestion.source.s3.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -56,9 +66,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import TimeStamp
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    SchemaField,
     SchemaMetadata,
-    StringTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
@@ -68,22 +76,15 @@ from datahub.metadata.schema_classes import (
     OtherSchemaClass,
     PartitionsSummaryClass,
     PartitionSummaryClass,
-    SchemaFieldDataTypeClass,
     _Aspect,
 )
 from datahub.telemetry import stats, telemetry
-from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.perf_timer import PerfTimer
 
-if TYPE_CHECKING:
-    from mypy_boto3_s3.service_resource import Bucket
-
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
-PAGE_SIZE = 1000
-
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])
 
@@ -109,14 +110,7 @@ profiling_flags_to_report = [
     "include_field_sample_values",
 ]
 
-
-# LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG = AddDatasetBrowsePathConfig(
-#     path_templates=["/ENV/PLATFORMDATASET_PARTS"], replace_existing=True
-# )
-#
-# LOCAL_BROWSE_PATH_TRANSFORMER = AddDatasetBrowsePathTransformer(
-#     ctx=None, config=LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG
-# )
+URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")
 
 
 def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
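The new module-level URI_SCHEME_REGEX supersedes the urlparse- and strip_s3_prefix-based handling in several places below, letting the source treat s3://, s3a://, gs://, and file:// paths uniformly. A minimal standalone sketch of the normalization it enables (the strip_scheme helper is illustrative, not part of the diff):

    import re

    # Mirrors the constant added above: matches any lowercase alphanumeric URI scheme.
    URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

    def strip_scheme(uri: str) -> str:
        # Drop the scheme plus any leading/trailing slashes, as the browse-path logic below does.
        return re.sub(URI_SCHEME_REGEX, "", uri).strip("/")

    assert strip_scheme("s3://my-bucket/data/table/") == "my-bucket/data/table"
    assert strip_scheme("gs://my-bucket/data") == "my-bucket/data"
    assert strip_scheme("/local/path/") == "local/path"  # no scheme: only slashes stripped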
@@ -159,6 +153,15 @@ class Folder:
         )
 
 
+@dataclasses.dataclass
+class FolderInfo:
+    objects: List[Any]
+    total_size: int
+    min_time: datetime
+    max_time: datetime
+    latest_obj: Any
+
+
 @dataclasses.dataclass
 class BrowsePath:
     file: str
@@ -185,8 +188,15 @@ class TableData:
 
 
 @platform_name("S3 / Local Files", id="s3")
 @config_class(DataLakeSourceConfig)
-@support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@support_status(SupportStatus.CERTIFIED)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+        SourceCapabilityModifier.S3_BUCKET,
+    ],
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(
     SourceCapability.SCHEMA_METADATA, "Can infer schema from supported file types"
@@ -197,12 +207,59 @@ class S3Source(StatefulIngestionSourceBase):
     report: DataLakeSourceReport
     profiling_times_taken: List[float]
     container_WU_creator: ContainerWUCreator
+    object_store_adapter: Any
 
     def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.source_config = config
         self.report = DataLakeSourceReport()
         self.profiling_times_taken = []
+        self.container_WU_creator = ContainerWUCreator(
+            self.source_config.platform,
+            self.source_config.platform_instance,
+            self.source_config.env,
+        )
+
+        # Create an object store adapter for handling external URLs and paths
+        if self.is_s3_platform():
+            # Get the AWS region from config, if available
+            aws_region = None
+            if self.source_config.aws_config:
+                aws_region = self.source_config.aws_config.aws_region
+
+                # For backward compatibility with tests: if we're using a test endpoint, use us-east-1
+                if self.source_config.aws_config.aws_endpoint_url and (
+                    "localstack"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                    or "storage.googleapis.com"
+                    in self.source_config.aws_config.aws_endpoint_url.lower()
+                ):
+                    aws_region = "us-east-1"
+
+            # Create an S3 adapter with the configured region
+            self.object_store_adapter = create_object_store_adapter(
+                "s3", aws_region=aws_region
+            )
+
+            # Special handling for GCS via S3 (via boto compatibility layer)
+            if (
+                self.source_config.aws_config
+                and self.source_config.aws_config.aws_endpoint_url
+                and "storage.googleapis.com"
+                in self.source_config.aws_config.aws_endpoint_url.lower()
+            ):
+                # We need to preserve the S3-style paths but use GCS external URL generation
+                self.object_store_adapter = create_object_store_adapter("gcs")
+                # Override create_s3_path to maintain S3 compatibility
+                self.object_store_adapter.register_customization(
+                    "create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}"
+                )
+        else:
+            # For local files, create a default adapter
+            self.object_store_adapter = create_object_store_adapter(
+                self.source_config.platform or "file"
+            )
+
         config_report = {
             config_option: config.dict().get(config_option)
             for config_option in config_options_to_report
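The rewritten constructor routes external-URL and path construction through an object store adapter, and the GCS-behind-an-S3-endpoint branch overrides create_s3_path via register_customization so S3-style paths are preserved while GCS external URLs are generated. A toy sketch of such a customization hook, assuming a dictionary of overrides (illustrative only; the real adapters live in datahub/ingestion/source/data_lake_common/object_store.py, whose internals are not shown in this diff):

    from typing import Any, Callable, Dict

    class MinimalAdapter:
        """Illustrative stand-in for an object store adapter with override support."""

        def __init__(self) -> None:
            self._custom: Dict[str, Callable[..., Any]] = {}

        def register_customization(self, name: str, fn: Callable[..., Any]) -> None:
            # Store an override keyed by method name, e.g. "create_s3_path".
            self._custom[name] = fn

        def create_s3_path(self, bucket: str, key: str) -> str:
            fn = self._custom.get("create_s3_path")
            # A GCS-flavored adapter would default to gs:// paths; the override forces s3://.
            return fn(bucket, key) if fn else f"gs://{bucket}/{key}"

    adapter = MinimalAdapter()
    adapter.register_customization("create_s3_path", lambda bucket, key: f"s3://{bucket}/{key}")
    assert adapter.create_s3_path("b", "k") == "s3://b/k"  # the override wins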
@@ -319,7 +376,10 @@ class S3Source(StatefulIngestionSourceBase):
 
     def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]:
         logger.debug(f"Opening file {file} for profiling in spark")
-        file = file.replace("s3://", "s3a://")
+        if "s3://" in file:
+            # replace s3:// with s3a://, and make sure standalone bucket names always end with a slash.
+            # Spark will fail if given a path like `s3a://mybucket`, and requires it to be `s3a://mybucket/`.
+            file = f"s3a://{get_bucket_name(file)}/{get_bucket_relative_path(file)}"
 
         telemetry.telemetry_instance.ping("data_lake_file", {"extension": ext})
 
@@ -376,9 +436,8 @@ class S3Source(StatefulIngestionSourceBase):
                 self.source_config.verify_ssl
             )
 
-            file = smart_open(
-                table_data.full_path, "rb", transport_params={"client": s3_client}
-            )
+            path = re.sub(URI_SCHEME_REGEX, "s3://", table_data.full_path)
+            file = smart_open(path, "rb", transport_params={"client": s3_client})
         else:
             # We still use smart_open here to take advantage of the compression
             # capabilities of smart_open.
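Both hunks above normalize schemes before handing paths to third-party libraries: read_file_spark rebuilds the URI from bucket and key because Spark rejects a bare s3a://mybucket without a trailing slash, and the smart_open read path rewrites any scheme back to s3:// first. A standalone sketch of the Spark-path rewrite, with hypothetical simplified helpers standing in for get_bucket_name and get_bucket_relative_path:

    def to_spark_path(file: str) -> str:
        # Hypothetical simplified stand-ins for the s3_util helpers used above.
        without_scheme = file.split("://", 1)[1]
        bucket, _, key = without_scheme.partition("/")
        # Rebuilding guarantees a slash after the bucket name, even when key == "".
        return f"s3a://{bucket}/{key}"

    assert to_spark_path("s3://mybucket") == "s3a://mybucket/"
    assert to_spark_path("s3://mybucket/dir/file.csv") == "s3a://mybucket/dir/file.csv"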
@@ -417,7 +476,7 @@ class S3Source(StatefulIngestionSourceBase):
         fields = sorted(fields, key=lambda f: f.fieldPath)
 
         if self.source_config.add_partition_columns_to_schema and table_data.partitions:
-            self.add_partition_columns_to_schema(
+            add_partition_columns_to_schema(
                 fields=fields, path_spec=path_spec, full_path=table_data.full_path
             )
 
@@ -453,34 +512,6 @@
         else:
             return None
 
-    def add_partition_columns_to_schema(
-        self, path_spec: PathSpec, full_path: str, fields: List[SchemaField]
-    ) -> None:
-        is_fieldpath_v2 = False
-        for field in fields:
-            if field.fieldPath.startswith("[version=2.0]"):
-                is_fieldpath_v2 = True
-                break
-        partition_keys = path_spec.get_partition_from_path(full_path)
-        if not partition_keys:
-            return None
-
-        for partition_key in partition_keys:
-            fields.append(
-                SchemaField(
-                    fieldPath=(
-                        f"{partition_key[0]}"
-                        if not is_fieldpath_v2
-                        else f"[version=2.0].[type=string].{partition_key[0]}"
-                    ),
-                    nativeDataType="string",
-                    type=SchemaFieldDataTypeClass(StringTypeClass()),
-                    isPartitioningKey=True,
-                    nullable=True,
-                    recursive=False,
-                )
-            )
-
     def get_table_profile(
         self, table_data: TableData, dataset_urn: str
     ) -> Iterable[MetadataWorkUnit]:
@@ -605,17 +636,28 @@
             maxPartition=max_partition_summary, minPartition=min_partition_summary
         )
 
+    def get_external_url(self, table_data: TableData) -> Optional[str]:
+        """
+        Get the external URL for a table using the configured object store adapter.
+
+        Args:
+            table_data: Table data containing path information
+
+        Returns:
+            An external URL or None if not applicable
+        """
+        # The adapter handles all the URL generation with proper region handling
+        return self.object_store_adapter.get_external_url(table_data)
+
     def ingest_table(
         self, table_data: TableData, path_spec: PathSpec
     ) -> Iterable[MetadataWorkUnit]:
         aspects: List[Optional[_Aspect]] = []
 
         logger.info(f"Extracting table schema from file: {table_data.full_path}")
-        browse_path: str = (
-            strip_s3_prefix(table_data.table_path)
-            if self.is_s3_platform()
-            else table_data.table_path.strip("/")
-        )
+
+        # remove protocol and any leading or trailing slashes
+        browse_path = re.sub(URI_SCHEME_REGEX, "", table_data.table_path).strip("/")
 
         data_platform_urn = make_data_platform_urn(self.source_config.platform)
         logger.info(f"Creating dataset urn with name: {browse_path}")
@@ -674,6 +716,7 @@
                 if max_partition
                 else None
             ),
+            externalUrl=self.get_external_url(table_data),
         )
         aspects.append(dataset_properties)
         if table_data.size_in_bytes > 0:
@@ -748,10 +791,20 @@
         else:
             return relative_path
 
-    def extract_table_name(self, path_spec: PathSpec, named_vars: dict) -> str:
-        if path_spec.table_name is None:
-            raise ValueError("path_spec.table_name is not set")
-        return path_spec.table_name.format_map(named_vars)
+    def extract_table_name_and_path(
+        self, path_spec: PathSpec, path: str
+    ) -> Tuple[str, str]:
+        # Extract the table name and base path from a path that's been normalized back to the
+        # "s3://" scheme that matches the path_spec
+        table_name, table_path = path_spec.extract_table_name_and_path(
+            self._normalize_uri_for_pattern_matching(path)
+        )
+        # Then convert the table base path back to the original scheme
+        scheme = re.match(URI_SCHEME_REGEX, path)
+        if scheme:
+            table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
+
+        return table_name, table_path
 
     def extract_table_data(
         self,
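The new extract_table_name_and_path normalizes an incoming URI to the s3:// scheme so one path_spec can match objects regardless of scheme, then restores the original scheme on the returned table path. A standalone sketch of that round trip, with a simple normalizer standing in for _normalize_uri_for_pattern_matching:

    import re

    URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

    def normalize_for_matching(uri: str) -> str:
        # Stand-in for _normalize_uri_for_pattern_matching: force the s3:// scheme.
        return re.sub(URI_SCHEME_REGEX, "s3://", uri)

    def restore_scheme(original: str, table_path: str) -> str:
        scheme = re.match(URI_SCHEME_REGEX, original)
        return re.sub(URI_SCHEME_REGEX, scheme[0], table_path) if scheme else table_path

    path = "gs://bucket/warehouse/events/part-0.parquet"
    assert normalize_for_matching(path) == "s3://bucket/warehouse/events/part-0.parquet"
    assert restore_scheme(path, "s3://bucket/warehouse/events") == "gs://bucket/warehouse/events"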
@@ -761,7 +814,7 @@
         path = browse_path.file
         partitions = browse_path.partitions
         logger.debug(f"Getting table data for path: {path}")
-        table_name, table_path = path_spec.extract_table_name_and_path(path)
+        table_name, table_path = self.extract_table_name_and_path(path_spec, path)
         return TableData(
             display_name=table_name,
             is_s3=self.is_s3_platform(),
@@ -785,72 +838,91 @@ class S3Source(StatefulIngestionSourceBase):
             content_type=browse_path.content_type,
         )
 
-    def resolve_templated_folders(self, bucket_name: str, prefix: str) -> Iterable[str]:
+    def resolve_templated_folders(self, prefix: str) -> Iterable[str]:
         folder_split: List[str] = prefix.split("*", 1)
         # If the len of split is 1 it means we don't have * in the prefix
         if len(folder_split) == 1:
             yield prefix
             return
 
-        folders: Iterable[str] = list_folders(
-            bucket_name, folder_split[0], self.source_config.aws_config
+        basename_startswith = folder_split[0].split("/")[-1]
+        dirname = folder_split[0].removesuffix(basename_startswith)
+
+        folders = list_folders_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
         )
         for folder in folders:
+            # Ensure proper path joining - folders from list_folders_path never include a
+            # trailing slash, but we need to handle the case where folder_split[1] might
+            # start with a slash
+            remaining_pattern = folder_split[1]
+            if remaining_pattern.startswith("/"):
+                remaining_pattern = remaining_pattern[1:]
+
             yield from self.resolve_templated_folders(
-                bucket_name, f"{folder}{folder_split[1]}"
+                f"{folder.path}/{remaining_pattern}"
             )
 
     def get_dir_to_process(
         self,
-        bucket_name: str,
-        folder: str,
+        uri: str,
         path_spec: PathSpec,
-        protocol: str,
         min: bool = False,
     ) -> List[str]:
-        # if len(path_spec.include.split("/")) == len(f"{protocol}{bucket_name}/{folder}".split("/")):
-        #     return [f"{protocol}{bucket_name}/{folder}"]
-
-        iterator = list_folders(
-            bucket_name=bucket_name,
-            prefix=folder,
+        # Add any remaining parts of the path_spec before globs, excluding the
+        # final filename component, to the URI and prefix so that we don't
+        # unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = posixpath.dirname(path_spec.get_remaining_glob_include(uri)).split(
+            "*"
+        )[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Check if we're at the end of the include path. If so, no need to list sub-folders.
+        if path_spec.has_correct_number_of_directory_components(uri):
+            return [uri]
+
+        logger.debug(f"get_dir_to_process listing folders {uri=} {prefix=}")
+        iterator = list_folders_path(
+            s3_uri=uri,
+            startswith=prefix,
             aws_config=self.source_config.aws_config,
         )
-        iterator = peekable(iterator)
-        if iterator:
-            sorted_dirs = sorted(
-                iterator,
-                key=functools.cmp_to_key(partitioned_folder_comparator),
-                reverse=not min,
-            )
-            folders = []
-            for dir in sorted_dirs:
-                if path_spec.dir_allowed(f"{protocol}{bucket_name}/{dir}/"):
-                    folders_list = self.get_dir_to_process(
-                        bucket_name=bucket_name,
-                        folder=dir + "/",
-                        path_spec=path_spec,
-                        protocol=protocol,
-                        min=min,
-                    )
-                    folders.extend(folders_list)
-                    if path_spec.traversal_method != FolderTraversalMethod.ALL:
-                        return folders
-            if folders:
-                return folders
-            else:
-                return [f"{protocol}{bucket_name}/{folder}"]
-        return [f"{protocol}{bucket_name}/{folder}"]
+        sorted_dirs = sorted(
+            iterator,
+            key=lambda dir: functools.cmp_to_key(partitioned_folder_comparator)(
+                dir.name
+            ),
+            reverse=not min,
+        )
+        folders = []
+        for dir in sorted_dirs:
+            if path_spec.dir_allowed(dir.path):
+                folders_list = self.get_dir_to_process(
+                    uri=dir.path,
+                    path_spec=path_spec,
+                    min=min,
+                )
+                folders.extend(folders_list)
+                if path_spec.traversal_method != FolderTraversalMethod.ALL:
+                    return folders
+        if folders:
+            return folders
+        else:
+            return [uri]
 
     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket: "Bucket",
-        prefix: str,
+        uri: str,
     ) -> Iterable[Folder]:
         """
-        Retrieves all the folders in a path by listing all the files in the prefix.
-        If the prefix is a full path then only that folder will be extracted.
+        Retrieves all the folders in a path by recursively listing all the files under the
+        given URI.
 
         A folder has creation and modification times, size, and a sample file path.
         - Creation time is the earliest creation time of all files in the folder.
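resolve_templated_folders now expands one `*` segment at a time, listing folders that share the literal prefix before the wildcard and recursing until no wildcards remain. A compact standalone sketch of that expansion strategy, with list_folders as a stand-in for list_folders_path:

    from typing import Callable, Iterable, List

    def resolve_wildcards(
        prefix: str, list_folders: Callable[[str, str], List[str]]
    ) -> Iterable[str]:
        # Expand the first "*" segment, then recurse into each listed folder.
        head, star, tail = prefix.partition("*")
        if not star:  # no wildcard left: the prefix is fully resolved
            yield prefix
            return
        startswith = head.split("/")[-1]
        dirname = head[: len(head) - len(startswith)]
        for folder in list_folders(dirname, startswith):
            yield from resolve_wildcards(f"{folder}/{tail.lstrip('/')}", list_folders)

    fake_fs = {"data/": ["data/2023", "data/2024"]}

    def list_folders(dirname: str, startswith: str) -> List[str]:
        return [
            f for f in fake_fs.get(dirname, [])
            if f.rsplit("/", 1)[-1].startswith(startswith)
        ]

    assert list(resolve_wildcards("data/*/logs/", list_folders)) == [
        "data/2023/logs/",
        "data/2024/logs/",
    ]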
@@ -860,72 +932,174 @@ class S3Source(StatefulIngestionSourceBase):
 
         Parameters:
             path_spec (PathSpec): The path specification used to determine partitioning.
-            bucket (Bucket): The S3 bucket object.
-            prefix (str): The prefix path in the S3 bucket to list objects from.
+            uri (str): The path in the S3 bucket to list objects from.
 
         Returns:
             List[Folder]: A list of Folder objects representing the partitions found.
         """
 
         def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
-            allowed = path_spec_.allowed(s3_uri)
+            # Normalize URI for pattern matching
+            normalized_uri = self._normalize_uri_for_pattern_matching(s3_uri)
+
+            allowed = path_spec_.allowed(normalized_uri)
             if not allowed:
                 logger.debug(f"File {s3_uri} not allowed and skipping")
                 self.report.report_file_dropped(s3_uri)
             return allowed
 
-        s3_objects = (
-            obj
-            for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-            if _is_allowed_path(
-                path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+        # Add any remaining parts of the path_spec before globs to the URI and prefix,
+        # so that we don't unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = path_spec.get_remaining_glob_include(uri).split("*")[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Process objects in a memory-efficient streaming fashion.
+        # Instead of loading all objects into memory, we'll accumulate folder data incrementally.
+        folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo
+
+        logger.info(f"Listing objects under {repr(uri)} with {prefix=}")
+
+        for obj in list_objects_recursive_path(
+            uri, startswith=prefix, aws_config=self.source_config.aws_config
+        ):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            if not _is_allowed_path(path_spec, s3_path):
+                continue
+
+            # Extract the directory name (folder) from the object key
+            dirname = obj.key.rsplit("/", 1)[0]
+
+            # Initialize folder data if we haven't seen this directory before
+            if dirname not in folder_data:
+                folder_data[dirname] = FolderInfo(
+                    objects=[],
+                    total_size=0,
+                    min_time=obj.last_modified,
+                    max_time=obj.last_modified,
+                    latest_obj=obj,
+                )
+
+            # Update folder statistics incrementally
+            folder_info = folder_data[dirname]
+            folder_info.objects.append(obj)
+            folder_info.total_size += obj.size
+
+            # Track min/max times and latest object
+            if obj.last_modified < folder_info.min_time:
+                folder_info.min_time = obj.last_modified
+            if obj.last_modified > folder_info.max_time:
+                folder_info.max_time = obj.last_modified
+                folder_info.latest_obj = obj
+
+        # Yield folders after processing all objects
+        for _dirname, folder_info in folder_data.items():
+            latest_obj = folder_info.latest_obj
+            max_file_s3_path = self.create_s3_path(
+                latest_obj.bucket_name, latest_obj.key
             )
-        )
-        grouped_s3_objects_by_dirname = groupby_unsorted(
-            s3_objects,
-            key=lambda obj: obj.key.rsplit("/", 1)[0],
-        )
-        for _, group in grouped_s3_objects_by_dirname:
-            max_file = max(group, key=lambda x: x.last_modified)
-            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)
 
             # If partition_id is None, it means the folder is not a partition
-            partition_id = path_spec.get_partition_from_path(max_file_s3_path)
+            partition_id = path_spec.get_partition_from_path(
+                self._normalize_uri_for_pattern_matching(max_file_s3_path)
+            )
 
             yield Folder(
                 partition_id=partition_id,
                 is_partition=bool(partition_id),
-                creation_time=min(obj.last_modified for obj in group),
-                modification_time=max_file.last_modified,
+                creation_time=folder_info.min_time,
+                modification_time=folder_info.max_time,
                 sample_file=max_file_s3_path,
-                size=sum(obj.size for obj in group),
+                size=folder_info.total_size,
             )
 
+    def create_s3_path(self, bucket_name: str, key: str) -> str:
+        return f"s3://{bucket_name}/{key}"
+
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
+        """
+        Main entry point for browsing S3 objects and creating table-level datasets.
+
+        This method determines whether to use templated processing (for paths with {table})
+        or simple file-by-file processing (for paths without templates).
+
+        Args:
+            path_spec: Configuration specifying the S3 path pattern to scan
+            sample_size: Number of files to sample (used in simple processing)
+
+        Returns:
+            Iterator of BrowsePath objects representing datasets to be created
+
+        Examples:
+            - Templated: s3://bucket/data/*/{table}/** -> Groups files by table
+            - Simple: s3://bucket/data/*.csv -> Processes individual files
+        """
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set. Cannot browse s3")
+
+        logger.info(f"Processing path spec: {path_spec.include}")
+
+        # Check if we have {table} template in the path
+        has_table_template = "{table}" in path_spec.include
+
+        logger.info(f"Has table template: {has_table_template}")
+
+        if has_table_template:
+            logger.info("Using templated path processing")
+            # Always use templated processing when {table} is present
+            # This groups files under table-level datasets
+            yield from self._process_templated_path(path_spec)
+        else:
+            logger.info("Using simple path processing")
+            # Only use simple processing for non-templated paths
+            # This creates individual file-level datasets
+            yield from self._process_simple_path(path_spec)
+
+    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
+        """
+        Process S3 paths containing {table} templates to create table-level datasets.
+
+        This method handles complex path patterns with wildcards and templates by:
+        1. Replacing template placeholders with stars (except {table})
+        2. Resolving wildcards in the path up to the {table} marker
+        3. Finding all potential table folders under each resolved path
+        4. Applying configurable partition traversal strategy (ALL, MAX, MIN_MAX)
+        5. Aggregating files from selected partitions under each table
+        6. Creating one dataset per table (not per file)
+
+        Args:
+            path_spec: Path specification with {table} template
+
+        Yields:
+            BrowsePath: One per table (not per file), containing aggregated metadata
+        """
+
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
         s3 = self.source_config.aws_config.get_s3_resource(
             self.source_config.verify_ssl
         )
-        bucket_name = get_bucket_name(path_spec.include)
-        logger.debug(f"Scanning bucket: {bucket_name}")
-        bucket = s3.Bucket(bucket_name)
-        prefix = self.get_prefix(get_bucket_relative_path(path_spec.include))
-        logger.debug(f"Scanning objects with prefix:{prefix}")
+
+        # Find the part before {table}
+        table_marker = "{table}"
+        if table_marker not in path_spec.include:
+            logger.info("No {table} marker found in path")
+            return
+
+        # STEP 1: Replace template placeholders with stars (except {table}) to enable folder resolution
+        # This is the crucial missing logic from the original implementation
         matches = re.finditer(r"{\s*\w+\s*}", path_spec.include, re.MULTILINE)
         matches_list = list(matches)
-        if matches_list and path_spec.sample_files:
-            # Replace the patch_spec include's templates with star because later we want to resolve all the stars
-            # to actual directories.
-            # For example:
-            # "s3://my-test-bucket/*/{dept}/*/{table}/*/*.*" -> "s3://my-test-bucket/*/*/*/{table}/*/*.*"
-            # We only keep the last template as a marker to know the point util we need to resolve path.
-            # After the marker we can safely get sample files for sampling because it is not used in the
-            # table name, so we don't need all the files.
-            # This speed up processing but we won't be able to get a precise modification date/size/number of files.
+
+        if matches_list:
+            # Replace all templates with stars except keep {table} as the marker
             max_start: int = -1
             include: str = path_spec.include
             max_match: str = ""
+
             for match in matches_list:
                 pos = include.find(match.group())
                 if pos > max_start:
@@ -937,109 +1111,198 @@ class S3Source(StatefulIngestionSourceBase):
937
1111
  if max_match == "{table}":
938
1112
  break
939
1113
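The middle of that loop is collapsed in the `@@ -937,109 +1111,198 @@` fold above; per the comments in the removed block, its net effect is to rewrite every placeholder other than `{table}` into a `*`. A compact sketch of that end result, assuming a plain `re.sub` instead of the find-based loop the source actually uses:

```python
import re


def stars_except_table(include: str) -> str:
    """Replace every {placeholder} except {table} with a '*' wildcard."""
    return re.sub(
        r"{\s*\w+\s*}",
        lambda m: "*" if m.group() != "{table}" else m.group(),
        include,
    )


# The transformation described in the removed comments above:
assert (
    stars_except_table("s3://my-test-bucket/*/{dept}/*/{table}/*/*.*")
    == "s3://my-test-bucket/*/*/*/{table}/*/*.*"
)
```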
 
-            table_index = include.find(max_match)
-            for folder in self.resolve_templated_folders(
-                bucket_name, get_bucket_relative_path(include[:table_index])
-            ):
-                try:
-                    for f in list_folders(
-                        bucket_name, f"{folder}", self.source_config.aws_config
-                    ):
-                        dirs_to_process = []
-                        logger.info(f"Processing folder: {f}")
-                        if path_spec.traversal_method == FolderTraversalMethod.ALL:
-                            dirs_to_process.append(f)
-                        else:
-                            if (
-                                path_spec.traversal_method
-                                == FolderTraversalMethod.MIN_MAX
-                                or path_spec.traversal_method
-                                == FolderTraversalMethod.MAX
-                            ):
-                                protocol = ContainerWUCreator.get_protocol(
-                                    path_spec.include
-                                )
-                                dirs_to_process_max = self.get_dir_to_process(
-                                    bucket_name=bucket_name,
-                                    folder=f + "/",
-                                    path_spec=path_spec,
-                                    protocol=protocol,
-                                )
-                                dirs_to_process.append(dirs_to_process_max[0])
-
-                            if (
-                                path_spec.traversal_method
-                                == FolderTraversalMethod.MIN_MAX
-                            ):
-                                dirs_to_process_min = self.get_dir_to_process(
-                                    bucket_name=bucket_name,
-                                    folder=f + "/",
-                                    path_spec=path_spec,
-                                    min=True,
-                                )
-                                dirs_to_process.append(dirs_to_process_min[0])
-                        folders: List[Folder] = []
-                        for dir in dirs_to_process:
-                            logger.info(f"Getting files from folder: {dir}")
-                            prefix_to_process = urlparse(dir).path.lstrip("/")
-
-                            folders.extend(
-                                self.get_folder_info(
-                                    path_spec, bucket, prefix_to_process
-                                )
+            logger.info(f"Template replacement: {path_spec.include} -> {include}")
+        else:
+            include = path_spec.include
+
+        # Split the path at {table} to get the prefix that needs wildcard resolution
+        prefix_before_table = include.split(table_marker)[0]
+        logger.info(f"Prefix before table: {prefix_before_table}")
+
+        try:
+            # STEP 2: Resolve ALL wildcards in the path up to {table}
+            # This converts patterns like "s3://data/*/logs/" to actual paths
+            # like ["s3://data/2023/logs/", "s3://data/2024/logs/"]
+            resolved_prefixes = list(
+                self.resolve_templated_folders(prefix_before_table)
+            )
+            logger.info(f"Resolved prefixes: {resolved_prefixes}")
+
+            # STEP 3: Process each resolved prefix to find table folders
+            for resolved_prefix in resolved_prefixes:
+                logger.info(f"Processing resolved prefix: {resolved_prefix}")
+
+                # Get all folders that could be tables under this resolved prefix
+                # These are the actual table names (e.g., "users", "events", "logs")
+                table_folders = list(
+                    list_folders_path(
+                        resolved_prefix, aws_config=self.source_config.aws_config
+                    )
+                )
+                logger.debug(
+                    f"Found table folders under {resolved_prefix}: {[folder.name for folder in table_folders]}"
+                )
+
+                # STEP 4: Process each table folder to create a table-level dataset
+                for folder in table_folders:
+                    logger.info(f"Processing table path: {folder.path}")
+
+                    # Extract the table name using the ORIGINAL path spec pattern
+                    # matching (not the modified one); this uses the compiled regex
+                    # pattern to extract the table name from the full path
+                    table_name, _ = self.extract_table_name_and_path(
+                        path_spec, folder.path
+                    )
+
+                    # Apply table name filtering if configured
+                    if not path_spec.tables_filter_pattern.allowed(table_name):
+                        logger.debug(f"Table '{table_name}' is not allowed; skipping")
+                        continue
+
+                    # STEP 5: Handle partition traversal based on configuration
+                    dirs_to_process = []
+
+                    if path_spec.traversal_method == FolderTraversalMethod.ALL:
+                        # Process ALL partitions (original behavior)
+                        dirs_to_process = [folder.path]
+                        logger.debug(
+                            f"Processing ALL partition folders under: {folder.path}"
+                        )
+
+                    else:
+                        # Use the original get_dir_to_process logic for MIN/MAX
+                        if (
+                            path_spec.traversal_method == FolderTraversalMethod.MIN_MAX
+                            or path_spec.traversal_method == FolderTraversalMethod.MAX
+                        ):
+                            # Get the MAX partition using the original logic
+                            dirs_to_process_max = self.get_dir_to_process(
+                                uri=folder.path,
+                                path_spec=path_spec,
+                                min=False,
                             )
-                        max_folder = None
-                        if folders:
-                            max_folder = max(folders, key=lambda x: x.modification_time)
-                        if not max_folder:
-                            logger.warning(
-                                f"Unable to find any files in the folder {dir}. Skipping..."
+                            if dirs_to_process_max:
+                                dirs_to_process.extend(dirs_to_process_max)
+                                logger.debug(
+                                    f"Added MAX partition: {dirs_to_process_max}"
+                                )
+
+                        if path_spec.traversal_method == FolderTraversalMethod.MIN_MAX:
+                            # Get the MIN partition using the original logic
+                            dirs_to_process_min = self.get_dir_to_process(
+                                uri=folder.path,
+                                path_spec=path_spec,
+                                min=True,
                             )
-                            continue
+                            if dirs_to_process_min:
+                                dirs_to_process.extend(dirs_to_process_min)
+                                logger.debug(
+                                    f"Added MIN partition: {dirs_to_process_min}"
+                                )
1001
- partitions = list(filter(lambda x: x.is_partition, folders))
1002
- yield BrowsePath(
1003
- file=max_folder.sample_file,
1004
- timestamp=max_folder.modification_time,
1005
- size=max_folder.size,
1006
- partitions=partitions,
1007
- # TODO: Support content type inference for partitions
1201
+ # Process the selected partitions
1202
+ all_folders = []
1203
+ for partition_path in dirs_to_process:
1204
+ logger.info(f"Scanning files in partition: {partition_path}")
1205
+ partition_files = list(
1206
+ self.get_folder_info(path_spec, partition_path)
1008
1207
  )
1009
- except Exception as e:
1010
- # This odd check if being done because boto does not have a proper exception to catch
1011
- # The exception that appears in stacktrace cannot actually be caught without a lot more work
1012
- # https://github.com/boto/boto3/issues/1195
1013
- if "NoSuchBucket" in repr(e):
1014
- logger.debug(f"Got NoSuchBucket exception for {bucket_name}", e)
1015
- self.get_report().report_warning(
1016
- "Missing bucket", f"No bucket found {bucket_name}"
1208
+ all_folders.extend(partition_files)
1209
+
1210
+ if all_folders:
1211
+ # Use the most recent file across all processed partitions
1212
+ latest_file = max(
1213
+ all_folders, key=lambda x: x.modification_time
1214
+ )
1215
+
1216
+ # Get partition information
1217
+ partitions = [f for f in all_folders if f.is_partition]
1218
+
1219
+ # Calculate total size of processed partitions
1220
+ total_size = sum(f.size for f in all_folders)
1221
+
1222
+ # Create ONE BrowsePath per table
1223
+ # The key insight: we need to provide the sample file for schema inference
1224
+ # but the table path should be extracted correctly by extract_table_name_and_path
1225
+ yield BrowsePath(
1226
+ file=latest_file.sample_file, # Sample file for schema inference
1227
+ timestamp=latest_file.modification_time, # Latest timestamp
1228
+ size=total_size, # Size of processed partitions
1229
+ partitions=partitions, # Partition metadata
1017
1230
  )
1018
1231
  else:
1019
- raise e
1020
- else:
1021
- logger.debug(
1022
- "No template in the pathspec can't do sampling, fallbacking to do full scan"
1023
- )
1024
- path_spec.sample_files = False
1025
- for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
1026
- s3_path = self.create_s3_path(obj.bucket_name, obj.key)
1027
- logger.debug(f"Path: {s3_path}")
1028
-
1029
- content_type = None
1030
- if self.source_config.use_s3_content_type:
1031
- content_type = s3.Object(obj.bucket_name, obj.key).content_type
1032
-
1033
- yield BrowsePath(
1034
- file=s3_path,
1035
- timestamp=obj.last_modified,
1036
- size=obj.size,
1037
- partitions=[],
1038
- content_type=content_type,
1232
+ logger.warning(
1233
+ f"No files found in processed partitions for table {table_name}"
1234
+ )
1235
+
1236
+ except Exception as e:
1237
+ if isinstance(e, s3.meta.client.exceptions.NoSuchBucket):
1238
+ self.get_report().report_warning(
1239
+ "Missing bucket",
1240
+ f"No bucket found {e.response['Error'].get('BucketName')}",
1039
1241
  )
1242
+ return
1243
+ logger.error(f"Error in _process_templated_path: {e}")
1244
+ raise e
1040
1245
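Each table-level `BrowsePath` therefore carries aggregated rather than per-file metadata: the newest file becomes the schema-inference sample, and sizes are summed across the scanned partitions. A self-contained sketch of that aggregation, where `FileEntry` is a stand-in for the `Folder` records returned by `get_folder_info`:

```python
from dataclasses import dataclass
from datetime import datetime


@dataclass
class FileEntry:  # stand-in for the Folder records from get_folder_info
    path: str
    modification_time: datetime
    size: int
    is_partition: bool = True


files = [
    FileEntry("s3://b/t/year=2023/part-0.parquet", datetime(2023, 1, 2), 100),
    FileEntry("s3://b/t/year=2024/part-0.parquet", datetime(2024, 5, 1), 250),
]

latest = max(files, key=lambda f: f.modification_time)  # schema-inference sample
total_size = sum(f.size for f in files)  # table-level size, summed

assert latest.path.endswith("year=2024/part-0.parquet")
assert total_size == 350
```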
 
-    def create_s3_path(self, bucket_name: str, key: str) -> str:
-        return f"s3://{bucket_name}/{key}"
+    def _process_simple_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
+        """
+        Process simple S3 paths without {table} templates to create file-level datasets.
+
+        This method handles straightforward file patterns by:
+        1. Listing all files matching the pattern
+        2. Creating one dataset per file
+        3. Performing no aggregation or grouping
+
+        Use Cases:
+        - Individual file processing: s3://bucket/data/*.csv
+        - Direct file paths: s3://bucket/data/myfile.json
+        - Patterns without table grouping: s3://bucket/logs/*.log
+
+        Args:
+            path_spec: Path specification without a {table} template
+
+        Yields:
+            BrowsePath: One per file, containing individual file metadata
+
+        Example Output:
+        - BrowsePath(file="data/file1.csv", size=1000, partitions=[])
+        - BrowsePath(file="data/file2.csv", size=2000, partitions=[])
+        """
+
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set")
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )
+
+        path_spec.sample_files = False  # Disable sampling for simple paths
+
+        # Extract the prefix from the path spec (stops at the first wildcard)
+        prefix = self.get_prefix(path_spec.include)
+
+        basename_startswith = prefix.split("/")[-1]
+        dirname = prefix.removesuffix(basename_startswith)
+
+        # Iterate through all objects in the bucket matching the prefix
+        for obj in list_objects_recursive_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
+        ):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            # Get the content type if configured
+            content_type = None
+            if self.source_config.use_s3_content_type:
+                content_type = s3.Object(obj.bucket_name, obj.key).content_type
+
+            # Create one BrowsePath per file
+            yield BrowsePath(
+                file=s3_path,
+                timestamp=obj.last_modified,
+                size=obj.size,
+                partitions=[],  # No partitions in simple mode
+                content_type=content_type,
+            )
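The prefix split above separates the listing directory from a basename filter so the S3 listing can start as deep in the hierarchy as possible. With an assumed pattern `s3://bucket/data/logs/2024-*.log`, `get_prefix` would stop at the first wildcard and the split behaves like this (the `prefix` value is a hypothetical example, and `str.removesuffix` requires Python 3.9+):

```python
# Hypothetical value: what get_prefix would return for "data/logs/2024-*.log"
prefix = "data/logs/2024-"

basename_startswith = prefix.split("/")[-1]  # "2024-"
dirname = prefix.removesuffix(basename_startswith)  # "data/logs/"

assert (dirname, basename_startswith) == ("data/logs/", "2024-")
```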
 
     def local_browser(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
         prefix = self.get_prefix(path_spec.include)
@@ -1071,11 +1334,6 @@ class S3Source(StatefulIngestionSourceBase):
         )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        self.container_WU_creator = ContainerWUCreator(
-            self.source_config.platform,
-            self.source_config.platform_instance,
-            self.source_config.env,
-        )
         with PerfTimer() as timer:
             assert self.source_config.path_specs
             for path_spec in self.source_config.path_specs:
@@ -1088,8 +1346,13 @@ class S3Source(StatefulIngestionSourceBase):
                 )
                 table_dict: Dict[str, TableData] = {}
                 for browse_path in file_browser:
+                    # Normalize the URI for pattern matching
+                    normalized_file_path = self._normalize_uri_for_pattern_matching(
+                        browse_path.file
+                    )
+
                     if not path_spec.allowed(
-                        browse_path.file,
+                        normalized_file_path,
                         ignore_ext=self.is_s3_platform()
                         and self.source_config.use_s3_content_type,
                     ):
@@ -1165,5 +1428,13 @@ class S3Source(StatefulIngestionSourceBase):
     def is_s3_platform(self):
         return self.source_config.platform == "s3"
 
+    def strip_s3_prefix(self, s3_uri: str) -> str:
+        """Strip the S3 prefix from a URI. Can be overridden by adapters for other platforms."""
+        return strip_s3_prefix(s3_uri)
+
+    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
+        """Normalize a URI for pattern matching. Can be overridden by adapters for other platforms."""
+        return uri
+
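These two hooks exist so adapters built on `S3Source` can translate platform-specific URIs into the `s3://` shapes that the shared path-spec matching expects. A hypothetical override for a `gs://`-based adapter might look like the following sketch (illustrative only; real adapters may differ):

```python
def normalize_uri_for_pattern_matching(uri: str) -> str:
    """Hypothetical override: map gs:// URIs onto the s3:// form so that
    path_spec.allowed() patterns written as s3://... still match."""
    if uri.startswith("gs://"):
        return "s3://" + uri[len("gs://") :]
    return uri


assert (
    normalize_uri_for_pattern_matching("gs://bucket/data/x.csv")
    == "s3://bucket/data/x.csv"
)
```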
     def get_report(self):
         return self.report