acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/s3/source.py

@@ -3,15 +3,14 @@ import functools
 import logging
 import os
 import pathlib
+import posixpath
 import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
-from urllib.parse import urlparse
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import smart_open.compression as so_compression
-from more_itertools import peekable
 from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession
 from pyspark.sql.dataframe import DataFrame
@@ -35,14 +34,22 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags, list_folders
+from datahub.ingestion.source.aws.s3_boto_utils import (
+    get_s3_tags,
+    list_folders_path,
+    list_objects_recursive_path,
+)
 from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
     get_key_prefix,
     strip_s3_prefix,
 )
-from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
+from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
+from datahub.ingestion.source.data_lake_common.data_lake_utils import (
+    ContainerWUCreator,
+    add_partition_columns_to_schema,
+)
 from datahub.ingestion.source.data_lake_common.object_store import (
     create_object_store_adapter,
 )
@@ -59,9 +66,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import TimeStamp
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    SchemaField,
     SchemaMetadata,
-    StringTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
@@ -71,22 +76,15 @@ from datahub.metadata.schema_classes import (
     OtherSchemaClass,
     PartitionsSummaryClass,
     PartitionSummaryClass,
-    SchemaFieldDataTypeClass,
     _Aspect,
 )
 from datahub.telemetry import stats, telemetry
-from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.perf_timer import PerfTimer
 
-if TYPE_CHECKING:
-    from mypy_boto3_s3.service_resource import Bucket
-
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
-PAGE_SIZE = 1000
-
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])
 
@@ -112,14 +110,7 @@ profiling_flags_to_report = [
     "include_field_sample_values",
 ]
 
-
-# LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG = AddDatasetBrowsePathConfig(
-#     path_templates=["/ENV/PLATFORMDATASET_PARTS"], replace_existing=True
-# )
-#
-# LOCAL_BROWSE_PATH_TRANSFORMER = AddDatasetBrowsePathTransformer(
-#     ctx=None, config=LOCAL_BROWSE_PATH_TRANSFORMER_CONFIG
-# )
+URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")
 
 
 def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
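This hunk drops the commented-out browse-path transformer block and introduces the URI_SCHEME_REGEX constant used throughout the new code. A rough, standard-library-only illustration (not code from the package) of how such a scheme regex can rewrite or strip a URI's protocol:

    import re

    URI_SCHEME_REGEX = re.compile(r"^[a-z0-9]+://")

    # Rewrite any object-store scheme back to s3:// before pattern matching.
    print(URI_SCHEME_REGEX.sub("s3://", "gs://bucket/data/part-0.parquet"))
    # s3://bucket/data/part-0.parquet

    # Drop the scheme (and surrounding slashes) to build a browse path.
    print(URI_SCHEME_REGEX.sub("", "s3a://bucket/data/").strip("/"))
    # bucket/data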
@@ -162,6 +153,15 @@ class Folder:
     )
 
 
+@dataclasses.dataclass
+class FolderInfo:
+    objects: List[Any]
+    total_size: int
+    min_time: datetime
+    max_time: datetime
+    latest_obj: Any
+
+
 @dataclasses.dataclass
 class BrowsePath:
     file: str
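The new FolderInfo dataclass backs the streaming, per-directory aggregation that get_folder_info performs later in this diff. A minimal sketch of that accumulation pattern, assuming simple (key, size, last_modified) tuples rather than boto3 object summaries:

    import dataclasses
    from datetime import datetime
    from typing import Any, Dict, List

    @dataclasses.dataclass
    class FolderInfo:
        objects: List[Any]
        total_size: int
        min_time: datetime
        max_time: datetime
        latest_obj: Any

    def accumulate(records) -> Dict[str, FolderInfo]:
        # records: iterable of (key, size, last_modified) tuples
        folders: Dict[str, FolderInfo] = {}
        for key, size, ts in records:
            dirname = key.rsplit("/", 1)[0]
            info = folders.get(dirname)
            if info is None:
                folders[dirname] = info = FolderInfo([], 0, ts, ts, (key, size, ts))
            info.objects.append((key, size, ts))
            info.total_size += size
            if ts < info.min_time:
                info.min_time = ts
            if ts > info.max_time:
                info.max_time = ts
                info.latest_obj = (key, size, ts)
        return folders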
@@ -188,8 +188,15 @@ class TableData:
 
 @platform_name("S3 / Local Files", id="s3")
 @config_class(DataLakeSourceConfig)
-@support_status(SupportStatus.INCUBATING)
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@support_status(SupportStatus.CERTIFIED)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.FOLDER,
+        SourceCapabilityModifier.S3_BUCKET,
+    ],
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
 @capability(
     SourceCapability.SCHEMA_METADATA, "Can infer schema from supported file types"
@@ -369,7 +376,10 @@ class S3Source(StatefulIngestionSourceBase):
 
     def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]:
         logger.debug(f"Opening file {file} for profiling in spark")
-        file = file.replace("s3://", "s3a://")
+        if "s3://" in file:
+            # replace s3:// with s3a://, and make sure standalone bucket names always end with a slash.
+            # Spark will fail if given a path like `s3a://mybucket`, and requires it to be `s3a://mybucket/`.
+            file = f"s3a://{get_bucket_name(file)}/{get_bucket_relative_path(file)}"
 
         telemetry.telemetry_instance.ping("data_lake_file", {"extension": ext})
 
@@ -426,9 +436,8 @@ class S3Source(StatefulIngestionSourceBase):
                 self.source_config.verify_ssl
             )
 
-            file = smart_open(
-                table_data.full_path, "rb", transport_params={"client": s3_client}
-            )
+            path = re.sub(URI_SCHEME_REGEX, "s3://", table_data.full_path)
+            file = smart_open(path, "rb", transport_params={"client": s3_client})
         else:
             # We still use smart_open here to take advantage of the compression
             # capabilities of smart_open.
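The read_file_spark change a couple of hunks above swaps the blanket s3:// to s3a:// replacement for a rebuild of the path from its bucket and key. A small sketch of the same idea with simplified stand-ins for get_bucket_name and get_bucket_relative_path (assumptions for illustration, not the package's helpers); the point is that a bare bucket becomes s3a://mybucket/ rather than s3a://mybucket, which Spark rejects:

    def bucket_name(uri: str) -> str:
        return uri.split("://", 1)[1].split("/", 1)[0]

    def bucket_relative_path(uri: str) -> str:
        rest = uri.split("://", 1)[1]
        return rest.split("/", 1)[1] if "/" in rest else ""

    def to_spark_path(file: str) -> str:
        if "s3://" in file:
            return f"s3a://{bucket_name(file)}/{bucket_relative_path(file)}"
        return file

    assert to_spark_path("s3://mybucket") == "s3a://mybucket/"
    assert to_spark_path("s3://mybucket/data/x.csv") == "s3a://mybucket/data/x.csv"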
@@ -467,7 +476,7 @@ class S3Source(StatefulIngestionSourceBase):
         fields = sorted(fields, key=lambda f: f.fieldPath)
 
         if self.source_config.add_partition_columns_to_schema and table_data.partitions:
-            self.add_partition_columns_to_schema(
+            add_partition_columns_to_schema(
                 fields=fields, path_spec=path_spec, full_path=table_data.full_path
             )
 
@@ -503,34 +512,6 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return None
 
-    def add_partition_columns_to_schema(
-        self, path_spec: PathSpec, full_path: str, fields: List[SchemaField]
-    ) -> None:
-        is_fieldpath_v2 = False
-        for field in fields:
-            if field.fieldPath.startswith("[version=2.0]"):
-                is_fieldpath_v2 = True
-                break
-        partition_keys = path_spec.get_partition_from_path(full_path)
-        if not partition_keys:
-            return None
-
-        for partition_key in partition_keys:
-            fields.append(
-                SchemaField(
-                    fieldPath=(
-                        f"{partition_key[0]}"
-                        if not is_fieldpath_v2
-                        else f"[version=2.0].[type=string].{partition_key[0]}"
-                    ),
-                    nativeDataType="string",
-                    type=SchemaFieldDataTypeClass(StringTypeClass()),
-                    isPartitioningKey=True,
-                    nullable=True,
-                    recursive=False,
-                )
-            )
-
     def get_table_profile(
         self, table_data: TableData, dataset_urn: str
     ) -> Iterable[MetadataWorkUnit]:
@@ -674,11 +655,9 @@ class S3Source(StatefulIngestionSourceBase):
         aspects: List[Optional[_Aspect]] = []
 
         logger.info(f"Extracting table schema from file: {table_data.full_path}")
-        browse_path: str = (
-            strip_s3_prefix(table_data.table_path)
-            if self.is_s3_platform()
-            else table_data.table_path.strip("/")
-        )
+
+        # remove protocol and any leading or trailing slashes
+        browse_path = re.sub(URI_SCHEME_REGEX, "", table_data.table_path).strip("/")
 
         data_platform_urn = make_data_platform_urn(self.source_config.platform)
         logger.info(f"Creating dataset urn with name: {browse_path}")
@@ -812,10 +791,20 @@ class S3Source(StatefulIngestionSourceBase):
         else:
             return relative_path
 
-    def extract_table_name(self, path_spec: PathSpec, named_vars: dict) -> str:
-        if path_spec.table_name is None:
-            raise ValueError("path_spec.table_name is not set")
-        return path_spec.table_name.format_map(named_vars)
+    def extract_table_name_and_path(
+        self, path_spec: PathSpec, path: str
+    ) -> Tuple[str, str]:
+        # Extract the table name and base path from a path that's been normalized back to the
+        # "s3://" scheme that matches the path_spec
+        table_name, table_path = path_spec.extract_table_name_and_path(
+            self._normalize_uri_for_pattern_matching(path)
+        )
+        # Then convert the table base path back to the original scheme
+        scheme = re.match(URI_SCHEME_REGEX, path)
+        if scheme:
+            table_path = re.sub(URI_SCHEME_REGEX, scheme[0], table_path)
+
+        return table_name, table_path
 
     def extract_table_data(
         self,
@@ -825,7 +814,7 @@ class S3Source(StatefulIngestionSourceBase):
         path = browse_path.file
         partitions = browse_path.partitions
         logger.debug(f"Getting table data for path: {path}")
-        table_name, table_path = path_spec.extract_table_name_and_path(path)
+        table_name, table_path = self.extract_table_name_and_path(path_spec, path)
         return TableData(
             display_name=table_name,
             is_s3=self.is_s3_platform(),
@@ -849,72 +838,91 @@ class S3Source(StatefulIngestionSourceBase):
             content_type=browse_path.content_type,
         )
 
-    def resolve_templated_folders(self, bucket_name: str, prefix: str) -> Iterable[str]:
+    def resolve_templated_folders(self, prefix: str) -> Iterable[str]:
         folder_split: List[str] = prefix.split("*", 1)
         # If the len of split is 1 it means we don't have * in the prefix
         if len(folder_split) == 1:
             yield prefix
             return
 
-        folders: Iterable[str] = list_folders(
-            bucket_name, folder_split[0], self.source_config.aws_config
+        basename_startswith = folder_split[0].split("/")[-1]
+        dirname = folder_split[0].removesuffix(basename_startswith)
+
+        folders = list_folders_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
         )
         for folder in folders:
+            # Ensure proper path joining - folders from list_folders path never include a
+            # trailing slash, but we need to handle the case where folder_split[1] might
+            # start with a slash
+            remaining_pattern = folder_split[1]
+            if remaining_pattern.startswith("/"):
+                remaining_pattern = remaining_pattern[1:]
+
             yield from self.resolve_templated_folders(
-                bucket_name, f"{folder}{folder_split[1]}"
+                f"{folder.path}/{remaining_pattern}"
             )
 
     def get_dir_to_process(
         self,
-        bucket_name: str,
-        folder: str,
+        uri: str,
         path_spec: PathSpec,
-        protocol: str,
         min: bool = False,
     ) -> List[str]:
-        # if len(path_spec.include.split("/")) == len(f"{protocol}{bucket_name}/{folder}".split("/")):
-        #     return [f"{protocol}{bucket_name}/{folder}"]
-
-        iterator = list_folders(
-            bucket_name=bucket_name,
-            prefix=folder,
+        # Add any remaining parts of the path_spec before globs, excluding the
+        # final filename component, to the URI and prefix so that we don't
+        # unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = posixpath.dirname(path_spec.get_remaining_glob_include(uri)).split(
+            "*"
+        )[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Check if we're at the end of the include path. If so, no need to list sub-folders.
+        if path_spec.has_correct_number_of_directory_components(uri):
+            return [uri]
+
+        logger.debug(f"get_dir_to_process listing folders {uri=} {prefix=}")
+        iterator = list_folders_path(
+            s3_uri=uri,
+            startswith=prefix,
             aws_config=self.source_config.aws_config,
         )
-        iterator = peekable(iterator)
-        if iterator:
-            sorted_dirs = sorted(
-                iterator,
-                key=functools.cmp_to_key(partitioned_folder_comparator),
-                reverse=not min,
-            )
-            folders = []
-            for dir in sorted_dirs:
-                if path_spec.dir_allowed(f"{protocol}{bucket_name}/{dir}/"):
-                    folders_list = self.get_dir_to_process(
-                        bucket_name=bucket_name,
-                        folder=dir + "/",
-                        path_spec=path_spec,
-                        protocol=protocol,
-                        min=min,
-                    )
-                    folders.extend(folders_list)
-                    if path_spec.traversal_method != FolderTraversalMethod.ALL:
-                        return folders
-            if folders:
-                return folders
-            else:
-                return [f"{protocol}{bucket_name}/{folder}"]
-        return [f"{protocol}{bucket_name}/{folder}"]
+        sorted_dirs = sorted(
+            iterator,
+            key=lambda dir: functools.cmp_to_key(partitioned_folder_comparator)(
+                dir.name
+            ),
+            reverse=not min,
+        )
+        folders = []
+        for dir in sorted_dirs:
+            if path_spec.dir_allowed(dir.path):
+                folders_list = self.get_dir_to_process(
+                    uri=dir.path,
+                    path_spec=path_spec,
+                    min=min,
+                )
+                folders.extend(folders_list)
+                if path_spec.traversal_method != FolderTraversalMethod.ALL:
+                    return folders
+        if folders:
+            return folders
+        else:
+            return [uri]
 
     def get_folder_info(
         self,
         path_spec: PathSpec,
-        bucket: "Bucket",
-        prefix: str,
+        uri: str,
     ) -> Iterable[Folder]:
         """
-        Retrieves all the folders in a path by listing all the files in the prefix.
-        If the prefix is a full path then only that folder will be extracted.
+        Retrieves all the folders in a path by recursively listing all the files under the
+        given URI.
 
         A folder has creation and modification times, size, and a sample file path.
         - Creation time is the earliest creation time of all files in the folder.
@@ -924,72 +932,174 @@ class S3Source(StatefulIngestionSourceBase):
 
         Parameters:
             path_spec (PathSpec): The path specification used to determine partitioning.
-            bucket (Bucket): The S3 bucket object.
-            prefix (str): The prefix path in the S3 bucket to list objects from.
+            uri (str): The path in the S3 bucket to list objects from.
 
         Returns:
            List[Folder]: A list of Folder objects representing the partitions found.
        """
 
         def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
-            allowed = path_spec_.allowed(s3_uri)
+            # Normalize URI for pattern matching
+            normalized_uri = self._normalize_uri_for_pattern_matching(s3_uri)
+
+            allowed = path_spec_.allowed(normalized_uri)
             if not allowed:
                 logger.debug(f"File {s3_uri} not allowed and skipping")
                 self.report.report_file_dropped(s3_uri)
             return allowed
 
-        s3_objects = (
-            obj
-            for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-            if _is_allowed_path(
-                path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+        # Add any remaining parts of the path_spec before globs to the URI and prefix,
+        # so that we don't unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = path_spec.get_remaining_glob_include(uri).split("*")[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Process objects in a memory-efficient streaming fashion
+        # Instead of loading all objects into memory, we'll accumulate folder data incrementally
+        folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo
+
+        logger.info(f"Listing objects under {repr(uri)} with {prefix=}")
+
+        for obj in list_objects_recursive_path(
+            uri, startswith=prefix, aws_config=self.source_config.aws_config
+        ):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            if not _is_allowed_path(path_spec, s3_path):
+                continue
+
+            # Extract the directory name (folder) from the object key
+            dirname = obj.key.rsplit("/", 1)[0]
+
+            # Initialize folder data if we haven't seen this directory before
+            if dirname not in folder_data:
+                folder_data[dirname] = FolderInfo(
+                    objects=[],
+                    total_size=0,
+                    min_time=obj.last_modified,
+                    max_time=obj.last_modified,
+                    latest_obj=obj,
+                )
+
+            # Update folder statistics incrementally
+            folder_info = folder_data[dirname]
+            folder_info.objects.append(obj)
+            folder_info.total_size += obj.size
+
+            # Track min/max times and latest object
+            if obj.last_modified < folder_info.min_time:
+                folder_info.min_time = obj.last_modified
+            if obj.last_modified > folder_info.max_time:
+                folder_info.max_time = obj.last_modified
+                folder_info.latest_obj = obj
+
+        # Yield folders after processing all objects
+        for _dirname, folder_info in folder_data.items():
+            latest_obj = folder_info.latest_obj
+            max_file_s3_path = self.create_s3_path(
+                latest_obj.bucket_name, latest_obj.key
             )
-        )
-        grouped_s3_objects_by_dirname = groupby_unsorted(
-            s3_objects,
-            key=lambda obj: obj.key.rsplit("/", 1)[0],
-        )
-        for _, group in grouped_s3_objects_by_dirname:
-            max_file = max(group, key=lambda x: x.last_modified)
-            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)
 
             # If partition_id is None, it means the folder is not a partition
-            partition_id = path_spec.get_partition_from_path(max_file_s3_path)
+            partition_id = path_spec.get_partition_from_path(
+                self._normalize_uri_for_pattern_matching(max_file_s3_path)
+            )
 
             yield Folder(
                 partition_id=partition_id,
                 is_partition=bool(partition_id),
-                creation_time=min(obj.last_modified for obj in group),
-                modification_time=max_file.last_modified,
+                creation_time=folder_info.min_time,
+                modification_time=folder_info.max_time,
                 sample_file=max_file_s3_path,
-                size=sum(obj.size for obj in group),
+                size=folder_info.total_size,
             )
 
+    def create_s3_path(self, bucket_name: str, key: str) -> str:
+        return f"s3://{bucket_name}/{key}"
+
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
+        """
+        Main entry point for browsing S3 objects and creating table-level datasets.
+
+        This method determines whether to use templated processing (for paths with {table})
+        or simple file-by-file processing (for paths without templates).
+
+        Args:
+            path_spec: Configuration specifying the S3 path pattern to scan
+            sample_size: Number of files to sample (used in simple processing)
+
+        Returns:
+            Iterator of BrowsePath objects representing datasets to be created
+
+        Examples:
+            - Templated: s3://bucket/data/*/{table}/** -> Groups files by table
+            - Simple: s3://bucket/data/*.csv -> Processes individual files
+        """
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set. Cannot browse s3")
+
+        logger.info(f"Processing path spec: {path_spec.include}")
+
+        # Check if we have {table} template in the path
+        has_table_template = "{table}" in path_spec.include
+
+        logger.info(f"Has table template: {has_table_template}")
+
+        if has_table_template:
+            logger.info("Using templated path processing")
+            # Always use templated processing when {table} is present
+            # This groups files under table-level datasets
+            yield from self._process_templated_path(path_spec)
+        else:
+            logger.info("Using simple path processing")
+            # Only use simple processing for non-templated paths
+            # This creates individual file-level datasets
+            yield from self._process_simple_path(path_spec)
+
+    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
+        """
+        Process S3 paths containing {table} templates to create table-level datasets.
+
+        This method handles complex path patterns with wildcards and templates by:
+        1. Replacing template placeholders with stars (except {table})
+        2. Resolving wildcards in the path up to the {table} marker
+        3. Finding all potential table folders under each resolved path
+        4. Applying configurable partition traversal strategy (ALL, MAX, MIN_MAX)
+        5. Aggregating files from selected partitions under each table
+        6. Creating one dataset per table (not per file)
+
+        Args:
+            path_spec: Path specification with {table} template
+
+        Yields:
+            BrowsePath: One per table (not per file), containing aggregated metadata
+        """
+
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
         s3 = self.source_config.aws_config.get_s3_resource(
             self.source_config.verify_ssl
         )
-        bucket_name = get_bucket_name(path_spec.include)
-        logger.debug(f"Scanning bucket: {bucket_name}")
-        bucket = s3.Bucket(bucket_name)
-        prefix = self.get_prefix(get_bucket_relative_path(path_spec.include))
-        logger.debug(f"Scanning objects with prefix:{prefix}")
+
+        # Find the part before {table}
+        table_marker = "{table}"
+        if table_marker not in path_spec.include:
+            logger.info("No {table} marker found in path")
+            return
+
+        # STEP 1: Replace template placeholders with stars (except {table}) to enable folder resolution
+        # This is the crucial missing logic from the original implementation
         matches = re.finditer(r"{\s*\w+\s*}", path_spec.include, re.MULTILINE)
         matches_list = list(matches)
-        if matches_list and path_spec.sample_files:
-            # Replace the patch_spec include's templates with star because later we want to resolve all the stars
-            # to actual directories.
-            # For example:
-            # "s3://my-test-bucket/*/{dept}/*/{table}/*/*.*" -> "s3://my-test-bucket/*/*/*/{table}/*/*.*"
-            # We only keep the last template as a marker to know the point util we need to resolve path.
-            # After the marker we can safely get sample files for sampling because it is not used in the
-            # table name, so we don't need all the files.
-            # This speed up processing but we won't be able to get a precise modification date/size/number of files.
+
+        if matches_list:
+            # Replace all templates with stars except keep {table} as the marker
             max_start: int = -1
             include: str = path_spec.include
             max_match: str = ""
+
             for match in matches_list:
                 pos = include.find(match.group())
                 if pos > max_start:
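STEP 1 in _process_templated_path above replaces every {placeholder} except {table} with a wildcard before the folders are resolved. A hypothetical one-liner showing that transformation on the example path from the removed comments (a simplified stand-in for the replacement loop in the diff, not the package's implementation):

    import re

    def templates_to_wildcards(include: str) -> str:
        # keep {table} as the marker, replace every other template with "*"
        return re.sub(
            r"{\s*\w+\s*}",
            lambda m: m.group() if m.group() == "{table}" else "*",
            include,
        )

    print(templates_to_wildcards("s3://my-test-bucket/*/{dept}/*/{table}/*/*.*"))
    # s3://my-test-bucket/*/*/*/{table}/*/*.*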
@@ -1001,120 +1111,198 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_match == "{table}":
                     break
 
-            table_index = include.find(max_match)
-            for folder in self.resolve_templated_folders(
-                bucket_name, get_bucket_relative_path(include[:table_index])
-            ):
-                try:
-                    for f in list_folders(
-                        bucket_name, f"{folder}", self.source_config.aws_config
-                    ):
-                        table_path = self.create_s3_path(bucket_name, f)
-                        table_name, _ = path_spec.extract_table_name_and_path(
-                            table_path
+            logger.info(f"Template replacement: {path_spec.include} -> {include}")
+        else:
+            include = path_spec.include
+
+        # Split the path at {table} to get the prefix that needs wildcard resolution
+        prefix_before_table = include.split(table_marker)[0]
+        logger.info(f"Prefix before table: {prefix_before_table}")
+
+        try:
+            # STEP 2: Resolve ALL wildcards in the path up to {table}
+            # This converts patterns like "s3://data/*/logs/" to actual paths like ["s3://data/2023/logs/", "s3://data/2024/logs/"]
+            resolved_prefixes = list(
+                self.resolve_templated_folders(prefix_before_table)
+            )
+            logger.info(f"Resolved prefixes: {resolved_prefixes}")
+
+            # STEP 3: Process each resolved prefix to find table folders
+            for resolved_prefix in resolved_prefixes:
+                logger.info(f"Processing resolved prefix: {resolved_prefix}")
+
+                # Get all folders that could be tables under this resolved prefix
+                # These are the actual table names (e.g., "users", "events", "logs")
+                table_folders = list(
+                    list_folders_path(
+                        resolved_prefix, aws_config=self.source_config.aws_config
+                    )
+                )
+                logger.debug(
+                    f"Found table folders under {resolved_prefix}: {[folder.name for folder in table_folders]}"
+                )
+
+                # STEP 4: Process each table folder to create a table-level dataset
+                for folder in table_folders:
+                    logger.info(f"Processing table path: {folder.path}")
+
+                    # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
+                    # This uses the compiled regex pattern to extract the table name from the full path
+                    table_name, _ = self.extract_table_name_and_path(
+                        path_spec, folder.path
+                    )
+
+                    # Apply table name filtering if configured
+                    if not path_spec.tables_filter_pattern.allowed(table_name):
+                        logger.debug(f"Table '{table_name}' not allowed and skipping")
+                        continue
+
+                    # STEP 5: Handle partition traversal based on configuration
+                    dirs_to_process = []
+
+                    if path_spec.traversal_method == FolderTraversalMethod.ALL:
+                        # Process ALL partitions (original behavior)
+                        dirs_to_process = [folder.path]
+                        logger.debug(
+                            f"Processing ALL partition folders under: {folder.path}"
                         )
-                        if not path_spec.tables_filter_pattern.allowed(table_name):
-                            logger.debug(
-                                f"Table '{table_name}' not allowed and skipping"
+
+                    else:
+                        # Use the original get_dir_to_process logic for MIN/MAX
+                        if (
+                            path_spec.traversal_method == FolderTraversalMethod.MIN_MAX
+                            or path_spec.traversal_method == FolderTraversalMethod.MAX
+                        ):
+                            # Get MAX partition using original logic
+                            dirs_to_process_max = self.get_dir_to_process(
+                                uri=folder.path,
+                                path_spec=path_spec,
+                                min=False,
                             )
-                            self.report.report_file_dropped(table_path)
-                            continue
-
-                        dirs_to_process = []
-                        logger.info(f"Processing folder: {f}")
-                        if path_spec.traversal_method == FolderTraversalMethod.ALL:
-                            dirs_to_process.append(f)
-                        else:
-                            if (
-                                path_spec.traversal_method
-                                == FolderTraversalMethod.MIN_MAX
-                                or path_spec.traversal_method
-                                == FolderTraversalMethod.MAX
-                            ):
-                                protocol = ContainerWUCreator.get_protocol(
-                                    path_spec.include
-                                )
-                                dirs_to_process_max = self.get_dir_to_process(
-                                    bucket_name=bucket_name,
-                                    folder=f + "/",
-                                    path_spec=path_spec,
-                                    protocol=protocol,
+                            if dirs_to_process_max:
+                                dirs_to_process.extend(dirs_to_process_max)
+                                logger.debug(
+                                    f"Added MAX partition: {dirs_to_process_max}"
                                 )
-                                dirs_to_process.append(dirs_to_process_max[0])
-
-                            if (
-                                path_spec.traversal_method
-                                == FolderTraversalMethod.MIN_MAX
-                            ):
-                                dirs_to_process_min = self.get_dir_to_process(
-                                    bucket_name=bucket_name,
-                                    folder=f + "/",
-                                    path_spec=path_spec,
-                                    protocol=protocol,
-                                    min=True,
-                                )
-                                dirs_to_process.append(dirs_to_process_min[0])
-                        folders: List[Folder] = []
-                        for dir in dirs_to_process:
-                            logger.info(f"Getting files from folder: {dir}")
-                            prefix_to_process = urlparse(dir).path.lstrip("/")
-
-                            folders.extend(
-                                self.get_folder_info(
-                                    path_spec, bucket, prefix_to_process
-                                )
-                            )
-                        max_folder = None
-                        if folders:
-                            max_folder = max(folders, key=lambda x: x.modification_time)
-                        if not max_folder:
-                            logger.warning(
-                                f"Unable to find any files in the folder {dir}. Skipping..."
+
+                        if path_spec.traversal_method == FolderTraversalMethod.MIN_MAX:
+                            # Get MIN partition using original logic
+                            dirs_to_process_min = self.get_dir_to_process(
+                                uri=folder.path,
+                                path_spec=path_spec,
+                                min=True,
                             )
-                            continue
+                            if dirs_to_process_min:
+                                dirs_to_process.extend(dirs_to_process_min)
+                                logger.debug(
+                                    f"Added MIN partition: {dirs_to_process_min}"
+                                )
 
-                        partitions = list(filter(lambda x: x.is_partition, folders))
-                        yield BrowsePath(
-                            file=max_folder.sample_file,
-                            timestamp=max_folder.modification_time,
-                            size=max_folder.size,
-                            partitions=partitions,
-                            # TODO: Support content type inference for partitions
+                    # Process the selected partitions
+                    all_folders = []
+                    for partition_path in dirs_to_process:
+                        logger.info(f"Scanning files in partition: {partition_path}")
+                        partition_files = list(
+                            self.get_folder_info(path_spec, partition_path)
                         )
-                except Exception as e:
-                    # This odd check if being done because boto does not have a proper exception to catch
-                    # The exception that appears in stacktrace cannot actually be caught without a lot more work
-                    # https://github.com/boto/boto3/issues/1195
-                    if "NoSuchBucket" in repr(e):
-                        logger.debug(f"Got NoSuchBucket exception for {bucket_name}", e)
-                        self.get_report().report_warning(
-                            "Missing bucket", f"No bucket found {bucket_name}"
+                        all_folders.extend(partition_files)
+
+                    if all_folders:
+                        # Use the most recent file across all processed partitions
+                        latest_file = max(
+                            all_folders, key=lambda x: x.modification_time
+                        )
+
+                        # Get partition information
+                        partitions = [f for f in all_folders if f.is_partition]
+
+                        # Calculate total size of processed partitions
+                        total_size = sum(f.size for f in all_folders)
+
+                        # Create ONE BrowsePath per table
+                        # The key insight: we need to provide the sample file for schema inference
+                        # but the table path should be extracted correctly by extract_table_name_and_path
+                        yield BrowsePath(
+                            file=latest_file.sample_file,  # Sample file for schema inference
+                            timestamp=latest_file.modification_time,  # Latest timestamp
+                            size=total_size,  # Size of processed partitions
+                            partitions=partitions,  # Partition metadata
                         )
                     else:
-                        raise e
-        else:
-            logger.debug(
-                "No template in the pathspec can't do sampling, fallbacking to do full scan"
-            )
-            path_spec.sample_files = False
-            for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
-                s3_path = self.create_s3_path(obj.bucket_name, obj.key)
-                logger.debug(f"Path: {s3_path}")
-
-                content_type = None
-                if self.source_config.use_s3_content_type:
-                    content_type = s3.Object(obj.bucket_name, obj.key).content_type
-
-                yield BrowsePath(
-                    file=s3_path,
-                    timestamp=obj.last_modified,
-                    size=obj.size,
-                    partitions=[],
-                    content_type=content_type,
+                        logger.warning(
+                            f"No files found in processed partitions for table {table_name}"
+                        )
+
+        except Exception as e:
+            if isinstance(e, s3.meta.client.exceptions.NoSuchBucket):
+                self.get_report().report_warning(
+                    "Missing bucket",
+                    f"No bucket found {e.response['Error'].get('BucketName')}",
                 )
+                return
+            logger.error(f"Error in _process_templated_path: {e}")
+            raise e
 
-    def create_s3_path(self, bucket_name: str, key: str) -> str:
-        return f"s3://{bucket_name}/{key}"
+    def _process_simple_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
+        """
+        Process simple S3 paths without {table} templates to create file-level datasets.
+
+        This method handles straightforward file patterns by:
+        1. Listing all files matching the pattern
+        2. Creating one dataset per file
+        3. No aggregation or grouping is performed
+
+        Use Cases:
+        - Individual file processing: s3://bucket/data/*.csv
+        - Direct file paths: s3://bucket/data/myfile.json
+        - Patterns without table grouping: s3://bucket/logs/*.log
+
+        Args:
+            path_spec: Path specification without {table} template
+
+        Yields:
+            BrowsePath: One per file, containing individual file metadata
+
+        Example Output:
+        - BrowsePath(file="data/file1.csv", size=1000, partitions=[])
+        - BrowsePath(file="data/file2.csv", size=2000, partitions=[])
+        """
+
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set")
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )
+
+        path_spec.sample_files = False  # Disable sampling for simple paths
+
+        # Extract the prefix from the path spec (stops at first wildcard)
+        prefix = self.get_prefix(path_spec.include)
+
+        basename_startswith = prefix.split("/")[-1]
+        dirname = prefix.removesuffix(basename_startswith)
+
+        # Iterate through all objects in the bucket matching the prefix
+        for obj in list_objects_recursive_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
+        ):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            # Get content type if configured
+            content_type = None
+            if self.source_config.use_s3_content_type:
+                content_type = s3.Object(obj.bucket_name, obj.key).content_type
+
+            # Create one BrowsePath per file
+            yield BrowsePath(
+                file=s3_path,
+                timestamp=obj.last_modified,
+                size=obj.size,
+                partitions=[],  # No partitions in simple mode
+                content_type=content_type,
+            )
 
     def local_browser(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
         prefix = self.get_prefix(path_spec.include)
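The MIN/MAX traversal handled above amounts to sorting candidate partition folders with a partition-aware comparator and taking the first entry in ascending or descending order. A self-contained sketch; the comparator below is a simplified stand-in, not the package's partitioned_folder_comparator:

    import functools

    def compare_partition_folders(a: str, b: str) -> int:
        # compare numeric partition values numerically, everything else lexically
        ka, kb = a.split("=")[-1], b.split("=")[-1]
        if ka.isdigit() and kb.isdigit():
            return (int(ka) > int(kb)) - (int(ka) < int(kb))
        return (ka > kb) - (ka < kb)

    folders = ["year=2023", "year=2024", "year=2019"]
    key = functools.cmp_to_key(compare_partition_folders)
    print(sorted(folders, key=key, reverse=True)[0])  # MAX partition -> year=2024
    print(sorted(folders, key=key)[0])                # MIN partition -> year=2019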
@@ -1158,8 +1346,13 @@ class S3Source(StatefulIngestionSourceBase):
                 )
                 table_dict: Dict[str, TableData] = {}
                 for browse_path in file_browser:
+                    # Normalize URI for pattern matching
+                    normalized_file_path = self._normalize_uri_for_pattern_matching(
+                        browse_path.file
+                    )
+
                     if not path_spec.allowed(
-                        browse_path.file,
+                        normalized_file_path,
                         ignore_ext=self.is_s3_platform()
                         and self.source_config.use_s3_content_type,
                     ):
@@ -1235,5 +1428,13 @@ class S3Source(StatefulIngestionSourceBase):
     def is_s3_platform(self):
         return self.source_config.platform == "s3"
 
+    def strip_s3_prefix(self, s3_uri: str) -> str:
+        """Strip S3 prefix from URI. Can be overridden by adapters for other platforms."""
+        return strip_s3_prefix(s3_uri)
+
+    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
+        """Normalize URI for pattern matching. Can be overridden by adapters for other platforms."""
+        return uri
+
     def get_report(self):
         return self.report
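The strip_s3_prefix and _normalize_uri_for_pattern_matching hooks added at the end are override points for adapters targeting other object stores. A hypothetical sketch of such an override (illustrative only, not code from the package):

    import re

    _SCHEME = re.compile(r"^[a-z0-9]+://")

    class GcsStyleAdapter:  # stands in for a subclass of S3Source
        def strip_s3_prefix(self, s3_uri: str) -> str:
            # drop whatever scheme the platform uses when building browse paths
            return _SCHEME.sub("", s3_uri)

        def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
            # rewrite the scheme to s3:// so path_spec patterns written for S3 still match
            return _SCHEME.sub("s3://", uri)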