acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic; review the advisory details in the registry before upgrading.

Files changed (414):
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -9,7 +9,9 @@ from datahub.configuration.validate_multiline_string import pydantic_multiline_s
9
9
 
10
10
 
11
11
  class GCPCredential(ConfigModel):
12
- project_id: Optional[str] = Field(description="Project id to set the credentials")
12
+ project_id: Optional[str] = Field(
13
+ None, description="Project id to set the credentials"
14
+ )
13
15
  private_key_id: str = Field(description="Private key id")
14
16
  private_key: str = Field(
15
17
  description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"
@@ -1,5 +1,10 @@
1
+ import logging
2
+ from typing import Any, Dict
3
+
1
4
  from datahub.utilities.str_enum import StrEnum
2
5
 
6
+ logger = logging.getLogger(__name__)
7
+
3
8
 
4
9
  class DatasetSubTypes(StrEnum):
5
10
  # Generic SubTypes
@@ -25,7 +30,12 @@ class DatasetSubTypes(StrEnum):
25
30
  NEO4J_NODE = "Neo4j Node"
26
31
  NEO4J_RELATIONSHIP = "Neo4j Relationship"
27
32
  SNOWFLAKE_STREAM = "Snowflake Stream"
33
+ DYNAMIC_TABLE = "Dynamic Table"
28
34
  API_ENDPOINT = "API Endpoint"
35
+ SLACK_CHANNEL = "Slack Channel"
36
+ PROJECTIONS = "Projections"
37
+ GOOGLE_SHEETS = "Google Sheets"
38
+ GOOGLE_SHEETS_NAMED_RANGE = "Google Sheets Named Range"
29
39
 
30
40
  # TODO: Create separate entity...
31
41
  NOTEBOOK = "Notebook"
@@ -46,13 +56,18 @@ class DatasetContainerSubTypes(StrEnum):
46
56
  ABS_CONTAINER = "ABS container"
47
57
  KEYSPACE = "Keyspace" # Cassandra
48
58
  NAMESPACE = "Namespace" # Iceberg
59
+ DREMIO_SPACE = "Dremio Space"
60
+ DREMIO_SOURCE = "Dremio Source"
49
61
 
50
62
 
51
63
  class BIContainerSubTypes(StrEnum):
52
64
  LOOKER_FOLDER = "Folder"
53
65
  LOOKML_PROJECT = "LookML Project"
54
66
  LOOKML_MODEL = "LookML Model"
67
+ TABLEAU_SITE = "Site"
68
+ TABLEAU_PROJECT = "Project"
55
69
  TABLEAU_WORKBOOK = "Workbook"
70
+ POWERBI_WORKSPACE = "Workspace"
56
71
  POWERBI_DATASET = "Semantic Model"
57
72
  POWERBI_DATASET_TABLE = "Table"
58
73
  QLIK_SPACE = "Qlik Space"
@@ -60,6 +75,8 @@ class BIContainerSubTypes(StrEnum):
60
75
  SIGMA_WORKSPACE = "Sigma Workspace"
61
76
  SIGMA_WORKBOOK = "Sigma Workbook"
62
77
  MODE_COLLECTION = "Collection"
78
+ GRAFANA_FOLDER = "Folder"
79
+ GRAFANA_DASHBOARD = "Dashboard"
63
80
 
64
81
 
65
82
  class FlowContainerSubTypes(StrEnum):
@@ -74,6 +91,9 @@ class JobContainerSubTypes(StrEnum):
74
91
 
75
92
 
76
93
  class BIAssetSubTypes(StrEnum):
94
+ DASHBOARD = "Dashboard"
95
+ CHART = "Chart"
96
+
77
97
  # Generic SubTypes
78
98
  REPORT = "Report"
79
99
 
@@ -116,3 +136,36 @@ class MLAssetSubTypes(StrEnum):
116
136
  VERTEX_PIPELINE = "Pipeline Job"
117
137
  VERTEX_PIPELINE_TASK = "Pipeline Task"
118
138
  VERTEX_PIPELINE_TASK_RUN = "Pipeline Task Run"
139
+
140
+
141
def create_source_capability_modifier_enum():
    """Merge every subtype enum above into one ``SourceCapabilityModifier`` StrEnum.

    Member names that appear in more than one source enum are kept from the
    first enum that defines them; later duplicates are skipped (with a debug
    log) so the merged enum never raises on a name collision.

    Returns:
        The dynamically created ``SourceCapabilityModifier`` enum class.
    """
    all_values: Dict[str, Any] = {}
    source_enums = [
        DatasetSubTypes,
        DatasetContainerSubTypes,
        BIContainerSubTypes,
        FlowContainerSubTypes,
        JobContainerSubTypes,
        BIAssetSubTypes,
        MLAssetSubTypes,
    ]

    for enum_class in source_enums:
        for member in enum_class:  # type: ignore[var-annotated]
            if member.name in all_values:
                # First occurrence wins; keep going so one collision doesn't abort the build.
                logger.debug(
                    f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
                )
                continue
            all_values[member.name] = member.value

    # Use the Enum functional API instead of exec()-ing generated source code:
    # it produces the same class, avoids dynamic code execution entirely, and
    # setting module= keeps the class picklable/introspectable.
    return StrEnum("SourceCapabilityModifier", all_values, module=__name__)


# This will have all values from the enums above
SourceCapabilityModifier = create_source_capability_modifier_enum()
@@ -25,10 +25,16 @@ from datahub.ingestion.source.data_lake_common.object_store import (
25
25
  get_object_store_bucket_name,
26
26
  get_object_store_for_uri,
27
27
  )
28
+ from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
28
29
  from datahub.ingestion.source.gcs.gcs_utils import (
29
30
  get_gcs_prefix,
30
31
  is_gcs_uri,
31
32
  )
33
+ from datahub.metadata.schema_classes import (
34
+ SchemaFieldClass,
35
+ SchemaFieldDataTypeClass,
36
+ StringTypeClass,
37
+ )
32
38
 
33
39
  # hide annoying debug errors from py4j
34
40
  logging.getLogger("py4j").setLevel(logging.ERROR)
@@ -39,6 +45,37 @@ PLATFORM_GCS = "gcs"
39
45
  PLATFORM_ABS = "abs"
40
46
 
41
47
 
48
def add_partition_columns_to_schema(
    path_spec: PathSpec, full_path: str, fields: List[SchemaFieldClass]
) -> None:
    """Append one schema field per partition key found in *full_path*.

    Mutates *fields* in place; each appended field is a non-nullable,
    string-typed partitioning key. The fieldPath format (v1 vs v2) mirrors
    whatever the existing fields already use.
    """
    partition_keys = path_spec.get_partition_from_path(full_path)
    if not partition_keys:
        # Nothing partition-like in the path; leave the schema untouched.
        return

    # Mirror the fieldPath version already present in the schema.
    uses_v2_paths = any(
        existing.fieldPath.startswith("[version=2.0]") for existing in fields
    )

    for partition in partition_keys:
        key_name = partition[0]
        if uses_v2_paths:
            field_path = f"[version=2.0].[type=string].{key_name}"
        else:
            field_path = f"{key_name}"
        fields.append(
            SchemaFieldClass(
                fieldPath=field_path,
                nativeDataType="string",
                type=SchemaFieldDataTypeClass(StringTypeClass()),
                isPartitioningKey=True,
                nullable=False,
                recursive=False,
            )
        )
+
78
+
42
79
  class ContainerWUCreator:
43
80
  processed_containers: List[str]
44
81
 
@@ -1,3 +1,4 @@
1
+ import re
1
2
  from abc import ABC, abstractmethod
2
3
 
3
4
  # Add imports for source customization
@@ -236,42 +237,76 @@ class ABSObjectStore(ObjectStoreInterface):
236
237
  """Implementation of ObjectStoreInterface for Azure Blob Storage."""
237
238
 
238
239
  PREFIX = "abfss://"
240
+ HTTPS_REGEX = re.compile(r"(https?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)")
239
241
 
240
242
@classmethod
def is_uri(cls, uri: str) -> bool:
    """True for abfss:// URIs and for https://<account>.blob.core.windows.net/ URLs."""
    if uri.startswith(cls.PREFIX):
        return True
    return cls.HTTPS_REGEX.match(uri) is not None
243
245
 
244
246
@classmethod
def get_prefix(cls, uri: str) -> Optional[str]:
    """Return the matched ABS prefix of *uri*, or None if it is not an ABS URI.

    For abfss:// URIs this is the literal scheme prefix; for HTTPS URLs it is
    the account root (e.g. "https://acct.blob.core.windows.net/").
    """
    if uri.startswith(cls.PREFIX):
        return cls.PREFIX
    https_match = cls.HTTPS_REGEX.match(uri)
    return https_match.group(1) if https_match else None
249
257
 
250
258
@classmethod
def strip_prefix(cls, uri: str) -> str:
    """Remove the ABS prefix (abfss:// scheme or HTTPS account root) from *uri*.

    Raises:
        ValueError: if *uri* is not a recognized ABS URI.
    """
    # Delegate to get_prefix so both supported URI forms (abfss:// and the
    # HTTPS account-root pattern) are detected in exactly one place instead
    # of duplicating the startswith/regex checks here.
    prefix = cls.get_prefix(uri)
    if prefix is None:
        raise ValueError(
            f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
        )
    return uri[len(prefix) :]
256
271
 
257
272
  @classmethod
258
273
  def get_bucket_name(cls, uri: str) -> str:
259
274
  if not cls.is_uri(uri):
260
- raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
261
- return cls.strip_prefix(uri).split("@")[0]
275
+ raise ValueError(
276
+ f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
277
+ )
278
+
279
+ if uri.startswith(cls.PREFIX):
280
+ # abfss://container@account.dfs.core.windows.net/path
281
+ return cls.strip_prefix(uri).split("@")[0]
282
+ else:
283
+ # https://account.blob.core.windows.net/container/path
284
+ return cls.strip_prefix(uri).split("/")[0]
262
285
 
263
286
  @classmethod
264
287
  def get_object_key(cls, uri: str) -> str:
265
288
  if not cls.is_uri(uri):
266
- raise ValueError(f"Not an ABS URI. Must start with prefix: {cls.PREFIX}")
267
- parts = cls.strip_prefix(uri).split("@", 1)
268
- if len(parts) < 2:
269
- return ""
270
- account_path = parts[1]
271
- path_parts = account_path.split("/", 1)
272
- if len(path_parts) < 2:
273
- return ""
274
- return path_parts[1]
289
+ raise ValueError(
290
+ f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
291
+ )
292
+
293
+ if uri.startswith(cls.PREFIX):
294
+ # abfss://container@account.dfs.core.windows.net/path
295
+ parts = cls.strip_prefix(uri).split("@", 1)
296
+ if len(parts) < 2:
297
+ return ""
298
+ account_path = parts[1]
299
+ path_parts = account_path.split("/", 1)
300
+ if len(path_parts) < 2:
301
+ return ""
302
+ return path_parts[1]
303
+ else:
304
+ # https://account.blob.core.windows.net/container/path
305
+ stripped = cls.strip_prefix(uri)
306
+ parts = stripped.split("/", 1)
307
+ if len(parts) < 2:
308
+ return ""
309
+ return parts[1]
275
310
 
276
311
 
277
312
  # Registry of all object store implementations
@@ -331,6 +366,12 @@ def get_object_store_bucket_name(uri: str) -> str:
331
366
  return uri[prefix_length:].split("/")[0]
332
367
  elif uri.startswith(ABSObjectStore.PREFIX):
333
368
  return uri[len(ABSObjectStore.PREFIX) :].split("@")[0]
369
+ elif ABSObjectStore.HTTPS_REGEX.match(uri):
370
+ # Handle HTTPS Azure Blob Storage URLs
371
+ match = ABSObjectStore.HTTPS_REGEX.match(uri)
372
+ if match:
373
+ stripped = uri[len(match.group(1)) :]
374
+ return stripped.split("/")[0]
334
375
 
335
376
  raise ValueError(f"Unsupported URI format: {uri}")
336
377
 
@@ -470,18 +511,25 @@ class ObjectStoreSourceAdapter:
470
511
  if not ABSObjectStore.is_uri(table_data.table_path):
471
512
  return None
472
513
 
473
- # Parse the ABS URI
474
514
  try:
475
- # URI format: abfss://container@account.dfs.core.windows.net/path
476
- path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
477
- parts = path_without_prefix.split("@", 1)
478
- if len(parts) < 2:
479
- return None
480
-
481
- container_name = parts[0]
482
- account_parts = parts[1].split("/", 1)
483
- account_domain = account_parts[0]
484
- account_name = account_domain.split(".")[0]
515
+ if table_data.table_path.startswith("abfss://"):
516
+ # URI format: abfss://container@account.dfs.core.windows.net/path
517
+ path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
518
+ parts = path_without_prefix.split("@", 1)
519
+ if len(parts) < 2:
520
+ return None
521
+
522
+ container_name = parts[0]
523
+ account_parts = parts[1].split("/", 1)
524
+ account_domain = account_parts[0]
525
+ account_name = account_domain.split(".")[0]
526
+ else:
527
+ # Handle HTTPS format: https://account.blob.core.windows.net/container/path
528
+ container_name = ABSObjectStore.get_bucket_name(table_data.table_path)
529
+ if "blob.core.windows.net" in table_data.table_path:
530
+ account_name = table_data.table_path.split("//")[1].split(".")[0]
531
+ else:
532
+ return None
485
533
 
486
534
  # Construct Azure portal URL
487
535
  return f"https://portal.azure.com/#blade/Microsoft_Azure_Storage/ContainerMenuBlade/overview/storageAccountId/{account_name}/containerName/{container_name}"
@@ -519,6 +567,13 @@ class ObjectStoreSourceAdapter:
519
567
  "get_external_url",
520
568
  lambda table_data: self.get_gcs_external_url(table_data),
521
569
  )
570
+ # Fix URI mismatch issue in pattern matching
571
+ self.register_customization(
572
+ "_normalize_uri_for_pattern_matching",
573
+ self._normalize_gcs_uri_for_pattern_matching,
574
+ )
575
+ # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
576
+ self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
522
577
  elif platform == "s3":
523
578
  self.register_customization("is_s3_platform", lambda: True)
524
579
  self.register_customization("create_s3_path", self.create_s3_path)
@@ -612,6 +667,39 @@ class ObjectStoreSourceAdapter:
612
667
  return self.get_abs_external_url(table_data)
613
668
  return None
614
669
 
670
+ def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
671
+ """
672
+ Normalize GCS URI for pattern matching.
673
+
674
+ This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
675
+ fixing the URI mismatch issue in GCS ingestion.
676
+
677
+ Args:
678
+ uri: The URI to normalize
679
+
680
+ Returns:
681
+ The normalized URI for pattern matching
682
+ """
683
+ if uri.startswith("gs://"):
684
+ return uri.replace("gs://", "s3://", 1)
685
+ return uri
686
+
687
+ def _strip_gcs_prefix(self, uri: str) -> str:
688
+ """
689
+ Strip GCS prefix from URI.
690
+
691
+ This method removes the gs:// prefix from GCS URIs for path processing.
692
+
693
+ Args:
694
+ uri: The URI to strip the prefix from
695
+
696
+ Returns:
697
+ The URI without the gs:// prefix
698
+ """
699
+ if uri.startswith("gs://"):
700
+ return uri[5:] # Remove "gs://" prefix
701
+ return uri
702
+
615
703
 
616
704
  # Factory function to create an adapter for a specific platform
617
705
  def create_object_store_adapter(
@@ -11,7 +11,7 @@ from cached_property import cached_property
11
11
  from pydantic.fields import Field
12
12
  from wcmatch import pathlib
13
13
 
14
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
14
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
15
15
  from datahub.ingestion.source.aws.s3_util import is_s3_uri
16
16
  from datahub.ingestion.source.azure.abs_utils import is_abs_uri
17
17
  from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
@@ -62,7 +62,6 @@ class SortKey(ConfigModel):
62
62
 
63
63
  date_format: Optional[str] = Field(
64
64
  default=None,
65
- type=str,
66
65
  description="The date format to use when sorting. This is used to parse the date from the key. The format should follow the java [SimpleDateFormat](https://docs.oracle.com/javase/8/docs/api/java/text/SimpleDateFormat.html) format.",
67
66
  )
68
67
 
@@ -90,63 +89,62 @@ class PathSpec(ConfigModel):
90
89
  description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
91
90
  )
92
91
  exclude: Optional[List[str]] = Field(
93
- default=[],
92
+ [],
94
93
  description="list of paths in glob pattern which will be excluded while scanning for the datasets",
95
94
  )
96
95
  file_types: List[str] = Field(
97
- default=SUPPORTED_FILE_TYPES,
96
+ SUPPORTED_FILE_TYPES,
98
97
  description="Files with extenstions specified here (subset of default value) only will be scanned to create dataset. Other files will be omitted.",
99
98
  )
100
99
 
101
100
  default_extension: Optional[str] = Field(
102
- default=None,
101
+ None,
103
102
  description="For files without extension it will assume the specified file type. If it is not set the files without extensions will be skipped.",
104
103
  )
105
104
 
106
105
  table_name: Optional[str] = Field(
107
- default=None,
106
+ None,
108
107
  description="Display name of the dataset.Combination of named variables from include path and strings",
109
108
  )
110
109
 
111
110
  # This is not used yet, but will be used in the future to sort the partitions
112
- sort_key: Optional[SortKey] = Field(
113
- hidden_from_docs=True,
114
- default=None,
111
+ sort_key: HiddenFromDocs[Optional[SortKey]] = Field(
112
+ None,
115
113
  description="Sort key to use when sorting the partitions. This is useful when the partitions are not sorted in the order of the data. The key can be a compound key based on the path_spec variables.",
116
114
  )
117
115
 
118
116
  enable_compression: bool = Field(
119
- default=True,
117
+ True,
120
118
  description="Enable or disable processing compressed files. Currently .gz and .bz files are supported.",
121
119
  )
122
120
 
123
121
  sample_files: bool = Field(
124
- default=True,
122
+ True,
125
123
  description="Not listing all the files but only taking a handful amount of sample file to infer the schema. File count and file size calculation will be disabled. This can affect performance significantly if enabled",
126
124
  )
127
125
 
128
126
  allow_double_stars: bool = Field(
129
- default=False,
127
+ False,
130
128
  description="Allow double stars in the include path. This can affect performance significantly if enabled",
131
129
  )
132
130
 
133
131
  autodetect_partitions: bool = Field(
134
- default=True,
132
+ True,
135
133
  description="Autodetect partition(s) from the path. If set to true, it will autodetect partition key/value if the folder format is {partition_key}={partition_value} for example `year=2024`",
136
134
  )
137
135
 
138
136
  traversal_method: FolderTraversalMethod = Field(
139
- default=FolderTraversalMethod.MAX,
137
+ FolderTraversalMethod.MAX,
140
138
  description="Method to traverse the folder. ALL: Traverse all the folders, MIN_MAX: Traverse the folders by finding min and max value, MAX: Traverse the folder with max value",
141
139
  )
142
140
 
143
141
  include_hidden_folders: bool = Field(
144
- default=False,
142
+ False,
145
143
  description="Include hidden folders in the traversal (folders starting with . or _",
146
144
  )
147
145
 
148
146
  tables_filter_pattern: AllowDenyPattern = Field(
149
- default=AllowDenyPattern.allow_all(),
147
+ AllowDenyPattern.allow_all(),
150
148
  description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
151
149
  )
152
150
 
@@ -166,7 +164,6 @@ class PathSpec(ConfigModel):
166
164
  return False
167
165
 
168
166
  def allowed(self, path: str, ignore_ext: bool = False) -> bool:
169
- logger.debug(f"Checking file to inclusion: {path}")
170
167
  if self.is_path_hidden(path) and not self.include_hidden_folders:
171
168
  return False
172
169
 
@@ -174,19 +171,17 @@ class PathSpec(ConfigModel):
174
171
  self.glob_include, flags=pathlib.GLOBSTAR
175
172
  ):
176
173
  return False
177
- logger.debug(f"{path} matched include ")
174
+
178
175
  if self.exclude:
179
176
  for exclude_path in self.exclude:
180
177
  if pathlib.PurePath(path).globmatch(
181
178
  exclude_path, flags=pathlib.GLOBSTAR
182
179
  ):
183
180
  return False
184
- logger.debug(f"{path} is not excluded")
185
181
 
186
182
  table_name, _ = self.extract_table_name_and_path(path)
187
183
  if not self.tables_filter_pattern.allowed(table_name):
188
184
  return False
189
- logger.debug(f"{path} is passed table name check")
190
185
 
191
186
  ext = os.path.splitext(path)[1].strip(".")
192
187
 
@@ -196,11 +191,12 @@ class PathSpec(ConfigModel):
196
191
  ):
197
192
  return False
198
193
 
199
- logger.debug(f"{path} had selected extension {ext}")
200
- logger.debug(f"{path} allowed for dataset creation")
201
194
  return True
202
195
 
203
196
  def dir_allowed(self, path: str) -> bool:
197
+ if not path.endswith("/"):
198
+ path += "/"
199
+
204
200
  if self.glob_include.endswith("**"):
205
201
  return self.allowed(path, ignore_ext=True)
206
202
 
@@ -219,10 +215,8 @@ class PathSpec(ConfigModel):
219
215
  for _ in range(slash_to_remove_from_glob):
220
216
  glob_include = glob_include.rsplit("/", 1)[0]
221
217
 
222
- logger.debug(f"Checking dir to inclusion: {path}")
223
218
  if not pathlib.PurePath(path).globmatch(glob_include, flags=pathlib.GLOBSTAR):
224
219
  return False
225
- logger.debug(f"{path} matched include ")
226
220
  if self.exclude:
227
221
  for exclude_path in self.exclude:
228
222
  if pathlib.PurePath(path.rstrip("/")).globmatch(
@@ -230,13 +224,12 @@ class PathSpec(ConfigModel):
230
224
  ):
231
225
  return False
232
226
 
233
- file_name_pattern = self.include.rsplit("/", 1)[1]
234
227
  table_name, _ = self.extract_table_name_and_path(
235
- os.path.join(path, file_name_pattern)
228
+ path + self.get_remaining_glob_include(path)
236
229
  )
237
230
  if not self.tables_filter_pattern.allowed(table_name):
238
231
  return False
239
- logger.debug(f"{path} is passed table name check")
232
+ # logger.debug(f"{path} is passed table name check")
240
233
 
241
234
  return True
242
235
 
@@ -246,10 +239,10 @@ class PathSpec(ConfigModel):
246
239
  if parsable_include.endswith("/{table}/**"):
247
240
  # Remove the last two characters to make it parsable if it ends with {table}/** which marks autodetect partition
248
241
  parsable_include = parsable_include[:-2]
249
- else:
250
- # Replace all * with {folder[i]} to make it parsable
251
- for i in range(parsable_include.count("*")):
252
- parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
242
+
243
+ # Replace all * with {folder[i]} to make it parsable
244
+ for i in range(parsable_include.count("*")):
245
+ parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
253
246
  return parsable_include
254
247
 
255
248
  def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]:
@@ -267,7 +260,7 @@ class PathSpec(ConfigModel):
267
260
  ) -> Union[None, parse.Result, parse.Match]:
268
261
  return self.compiled_folder_include.parse(path)
269
262
 
270
- @pydantic.root_validator()
263
+ @pydantic.root_validator(skip_on_failure=True)
271
264
  def validate_no_double_stars(cls, values: Dict) -> Dict:
272
265
  if "include" not in values:
273
266
  return values
@@ -330,8 +323,6 @@ class PathSpec(ConfigModel):
330
323
  if "{table}" in values["include"]:
331
324
  v = "{table}"
332
325
  else:
333
- logger.debug(f"include fields: {compiled_include.named_fields}")
334
- logger.debug(f"table_name fields: {parse.compile(v).named_fields}")
335
326
  if not all(
336
327
  x in compiled_include.named_fields
337
328
  for x in parse.compile(v).named_fields
@@ -356,9 +347,7 @@ class PathSpec(ConfigModel):
356
347
  @cached_property
357
348
  def compiled_include(self):
358
349
  parsable_include = PathSpec.get_parsable_include(self.include)
359
- logger.debug(f"parsable_include: {parsable_include}")
360
350
  compiled_include = parse.compile(parsable_include)
361
- logger.debug(f"Setting compiled_include: {compiled_include}")
362
351
  return compiled_include
363
352
 
364
353
  @cached_property
@@ -366,9 +355,8 @@ class PathSpec(ConfigModel):
366
355
  parsable_folder_include = PathSpec.get_parsable_include(self.include).rsplit(
367
356
  "/", 1
368
357
  )[0]
369
- logger.debug(f"parsable_folder_include: {parsable_folder_include}")
370
358
  compiled_folder_include = parse.compile(parsable_folder_include)
371
- logger.debug(f"Setting compiled_folder_include: {compiled_folder_include}")
359
+
372
360
  return compiled_folder_include
373
361
 
374
362
  @cached_property
@@ -376,7 +364,8 @@ class PathSpec(ConfigModel):
376
364
  # Regular expression to find all substrings enclosed in {}
377
365
  pattern = r"\{(.*?)\}"
378
366
  # Find all matches
379
- matches = re.findall(pattern, self.include.split("{table}/")[1])
367
+ split_parts = self.include.split("{table}/")
368
+ matches = re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []
380
369
  return matches
381
370
 
382
371
  def get_partition_from_path(self, path: str) -> Optional[List[Tuple[str, str]]]:
@@ -467,7 +456,11 @@ class PathSpec(ConfigModel):
467
456
  partition = partition.rsplit("/", 1)[0]
468
457
  for partition_key in partition.split("/"):
469
458
  if partition_key.find("=") != -1:
470
- partition_keys.append(tuple(partition_key.split("=")))
459
+ key_value = partition_key.split(
460
+ "=", 1
461
+ ) # Split into at most 2 parts
462
+ if len(key_value) == 2:
463
+ partition_keys.append((key_value[0], key_value[1]))
471
464
  else:
472
465
  partition_split = partition.rsplit("/", 1)
473
466
  if len(partition_split) == 1:
@@ -487,7 +480,8 @@ class PathSpec(ConfigModel):
487
480
  return glob_include
488
481
 
489
482
  @pydantic.root_validator(skip_on_failure=True)
490
- def validate_path_spec(cls, values: Dict) -> Dict[str, Any]:
483
+ @staticmethod
484
+ def validate_path_spec(values: Dict) -> Dict[str, Any]:
491
485
  # validate that main fields are populated
492
486
  required_fields = ["include", "file_types", "default_extension"]
493
487
  for f in required_fields:
@@ -563,7 +557,7 @@ class PathSpec(ConfigModel):
563
557
  f"{{{template_key}}}", var[key]
564
558
  )
565
559
  else:
566
- partition_format.replace(f"{{{var_key}}}", var)
560
+ partition_format = partition_format.replace(f"{{{var_key}}}", var)
567
561
  return datetime.datetime.strptime(partition_format, datetime_format).replace(
568
562
  tzinfo=datetime.timezone.utc
569
563
  )
@@ -571,7 +565,7 @@ class PathSpec(ConfigModel):
571
565
  def extract_table_name_and_path(self, path: str) -> Tuple[str, str]:
572
566
  parsed_vars = self.get_named_vars(path)
573
567
  if parsed_vars is None or "table" not in parsed_vars.named:
574
- return os.path.basename(path), path
568
+ return os.path.basename(path.removesuffix("/")), path
575
569
  else:
576
570
  include = self.include
577
571
  depth = include.count("/", 0, include.find("{table}"))
@@ -579,3 +573,38 @@ class PathSpec(ConfigModel):
579
573
  "/".join(path.split("/")[:depth]) + "/" + parsed_vars.named["table"]
580
574
  )
581
575
  return self._extract_table_name(parsed_vars.named), table_path
576
+
577
def has_correct_number_of_directory_components(self, path: str) -> bool:
    """
    Check whether *path* has exactly as many components as the path spec has
    directory components. Useful for deciding if the source must descend
    further into child directories or can switch to file listing. When the
    glob form of the path spec ends in "**", this always returns False.
    """
    glob = self.glob_include
    if glob.endswith("**"):
        return False

    normalized = path if path.endswith("/") else path + "/"
    # Equal slash counts means the directory depth matches the spec exactly.
    return normalized.count("/") == glob.count("/")
594
+
595
def get_remaining_glob_include(self, path: str) -> str:
    """
    Given a path, return the remaining components of the path spec (if any
    exist) in glob form. If the glob form of the path spec ends in "**", the
    return value also always ends in "**", regardless of how many components
    the input path has.
    """
    normalized = path if path.endswith("/") else path + "/"
    consumed_components = normalized.count("/")
    # Keep only the glob components beyond the depth the path already covers.
    remainder = "/".join(self.glob_include.split("/")[consumed_components:])
    if remainder:
        return remainder
    return "**" if self.glob_include.endswith("**") else ""