acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414) hide show
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,300 @@
1
+ import logging
2
+ import socket
3
+ import time
4
+ from typing import Iterable, Optional
5
+ from urllib.parse import urlparse
6
+
7
+ import dns.exception
8
+ import dns.resolver
9
+ import requests
10
+
11
+ from datahub.configuration.common import ConfigModel
12
+ from datahub.ingestion.api.common import PipelineContext
13
+ from datahub.ingestion.api.decorators import (
14
+ SupportStatus,
15
+ config_class,
16
+ platform_name,
17
+ support_status,
18
+ )
19
+ from datahub.ingestion.api.source import Source, SourceReport
20
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class DataHubDebugSourceConfig(ConfigModel):
    """Configuration for the DataHub debug source."""

    # Target of the network probe. When left as None (the default) the source
    # does not run a probe.
    dns_probe_url: Optional[str] = None
27
+
28
+
29
@platform_name("DataHubDebug")
@config_class(DataHubDebugSourceConfig)
@support_status(SupportStatus.TESTING)
class DataHubDebugSource(Source):
    """
    DataHubDebugSource is a helper to debug things in the executor where ingestion is running.

    This source can perform the following tasks:
    1. Network probe of a URL. Different from test connection in sources as that is after source starts.

    The probe only writes its findings to the log; the source yields no work units.
    """

    # Record types queried through the module-level ``dns.resolver.resolve``.
    _DNS_RECORD_TYPES = ("A", "AAAA", "CNAME", "MX")
    # Explicit name servers that are queried directly and checked for
    # reachability on port 53.
    _PUBLIC_DNS_SERVERS = ("8.8.8.8", "1.1.1.1", "208.67.222.222")
    # Timeout (seconds) for explicit-resolver queries and port-53 connects.
    _DNS_TIMEOUT_SECONDS = 5
    # Timeout (seconds) handed to ``requests`` for each HTTP probe.
    _HTTP_TIMEOUT_SECONDS = 10

    def __init__(self, ctx: PipelineContext, config: DataHubDebugSourceConfig):
        self.ctx = ctx
        self.config = config
        self.report = SourceReport()
        # NOTE(review): presumably silences the report's "no events produced"
        # warning, since this source never yields work units -- confirm
        # against SourceReport.
        self.report.event_not_produced_warn = False

    @classmethod
    def create(
        cls, config_dict: dict, ctx: PipelineContext
    ) -> "DataHubDebugSource":
        """Build the source from a raw config dictionary."""
        config = DataHubDebugSourceConfig.parse_obj(config_dict)
        return cls(ctx, config)

    def perform_dns_probe(self, url: str) -> None:
        """
        Perform comprehensive DNS probe and network connectivity tests.
        Logs detailed information to help diagnose network issues.

        Args:
            url: Target to probe. A value that does not start with
                ``http://`` or ``https://`` is treated as ``http://<url>``
                for both hostname parsing and the HTTP requests.

        Any unexpected error is logged rather than raised.
        """
        logger.info(f"Starting DNS probe for URL: {url}")
        logger.info("=" * 60)
        logger.info(f"DNS PROBE REPORT FOR: {url}")
        logger.info("=" * 60)

        try:
            # Normalise once and reuse the result everywhere: requests rejects
            # URLs without a scheme, so the HTTP test must receive the same
            # scheme-qualified URL that was parsed for the hostname.
            if url.startswith(("http://", "https://")):
                probe_url = url
            else:
                probe_url = f"http://{url}"

            parsed_url = urlparse(probe_url)
            hostname = parsed_url.hostname or parsed_url.netloc
            port = parsed_url.port or (443 if parsed_url.scheme == "https" else 80)

            logger.info(f"Parsed hostname: {hostname}")
            logger.info(f"Target port: {port}")
            logger.info(f"URL scheme: {parsed_url.scheme}")
            logger.info("-" * 60)

            if hostname:
                # Test 1: DNS resolution with dnspython
                logger.info("1. DNS RESOLUTION TEST")
                self._dns_probe_with_dnspython(hostname)

                logger.info("-" * 60)

                # Test 2: HTTP/HTTPS connectivity test with requests
                logger.info("2. HTTP CONNECTIVITY TEST")
                self._http_probe_with_requests(probe_url)
            else:
                # Without a hostname the DNS and HTTP tests have nothing
                # meaningful to query, so report the problem instead.
                logger.error(
                    f"Could not extract a hostname from URL: {url!r}; "
                    "skipping DNS and HTTP tests"
                )

            logger.info("-" * 60)

            # Test 3: System network information
            logger.info("3. SYSTEM NETWORK INFORMATION")
            self._log_system_network_info()

        except Exception as e:
            logger.error(f"DNS probe failed with unexpected error: {e}", exc_info=True)

        logger.info("=" * 60)
        logger.info("DNS PROBE COMPLETED")
        logger.info("=" * 60)

    def _dns_probe_with_dnspython(self, hostname: str) -> None:
        """Enhanced DNS probing using dnspython library.

        Queries each record type in ``_DNS_RECORD_TYPES`` through
        ``dns.resolver.resolve``, then resolves the A record against each
        server in ``_PUBLIC_DNS_SERVERS``. Every failure is logged and the
        probe moves on to the next query.
        """
        # Test different record types
        for record_type in self._DNS_RECORD_TYPES:
            try:
                # perf_counter is monotonic, so the measured duration cannot
                # be distorted by wall-clock adjustments.
                start_time = time.perf_counter()
                answers = dns.resolver.resolve(hostname, record_type)
                dns_time = time.perf_counter() - start_time

                logger.info(
                    f"✓ {record_type} record resolution successful ({dns_time:.3f}s)"
                )
                for answer in answers:
                    logger.info(f" - {record_type}: {answer}")

            except dns.resolver.NXDOMAIN:
                logger.info(f"✗ {record_type} record: Domain does not exist")
            except dns.resolver.NoAnswer:
                logger.info(
                    f"- {record_type} record: No answer (record type not available)"
                )
            except dns.exception.Timeout:
                logger.error(f"✗ {record_type} record: DNS query timed out")
            except Exception as e:
                logger.error(f"✗ {record_type} record query failed: {e}")

        # Test different DNS servers
        logger.info("Testing with different DNS servers:")
        for dns_server in self._PUBLIC_DNS_SERVERS:
            try:
                resolver = dns.resolver.Resolver()
                resolver.nameservers = [dns_server]
                resolver.timeout = self._DNS_TIMEOUT_SECONDS

                start_time = time.perf_counter()
                answers = resolver.resolve(hostname, "A")
                dns_time = time.perf_counter() - start_time

                logger.info(f"✓ DNS server {dns_server} responded ({dns_time:.3f}s)")
                for answer in answers:
                    logger.info(f" - A: {answer}")

            except Exception as e:
                logger.error(f"✗ DNS server {dns_server} failed: {e}")

    def _http_probe_with_requests(self, url: str) -> None:
        """HTTP connectivity test using requests library.

        Issues a HEAD request that follows redirects and then a GET request
        that does not, logging the outcome of each.

        Args:
            url: Scheme-qualified URL to request.
        """
        timeout = self._HTTP_TIMEOUT_SECONDS
        self._probe_http_method("HEAD", url, timeout, allow_redirects=True)
        self._probe_http_method("GET", url, timeout, allow_redirects=False)

    def _probe_http_method(
        self, method: str, url: str, timeout: float, allow_redirects: bool
    ) -> None:
        """Send one HTTP request and log its status, headers and timing.

        Args:
            method: HTTP verb, also used as the label in log messages.
            url: URL to request.
            timeout: Timeout in seconds passed to ``requests``.
            allow_redirects: Whether ``requests`` should follow redirects.

        All exceptions are logged; none propagate to the caller.
        """
        try:
            logger.info(f"Testing {method} request with timeout {timeout}s")
            start_time = time.perf_counter()

            response = requests.request(
                method, url, timeout=timeout, allow_redirects=allow_redirects
            )

            request_time = time.perf_counter() - start_time

            logger.info(f"✓ {method} request successful ({request_time:.3f}s)")
            logger.info(f" Status code: {response.status_code}")
            # Only the first five headers are logged to keep the output short.
            logger.info(
                f" Response headers: {dict(list(response.headers.items())[:5])}"
            )

            if response.url != url:
                logger.info(f" Final URL after redirects: {response.url}")

        # Timeout is handled before ConnectionError so that a timeout is
        # reported as such even if the exception also matches ConnectionError.
        except requests.exceptions.Timeout:
            logger.error(f"✗ {method} request timed out after {timeout}s")
        except requests.exceptions.ConnectionError as e:
            logger.error(f"✗ {method} connection error: {e}")
        except requests.exceptions.RequestException as e:
            logger.error(f"✗ {method} request failed: {e}")
        except Exception as e:
            logger.error(f"✗ {method} unexpected error: {e}")

    def _log_dns_troubleshooting(self) -> None:
        """Log DNS troubleshooting information.

        Not invoked by ``perform_dns_probe``; kept as a standalone helper.
        """
        logger.info("DNS TROUBLESHOOTING SUGGESTIONS:")
        logger.info("- Check if the hostname is correct")
        logger.info("- Verify DNS server configuration")
        logger.info("- Check network connectivity")
        logger.info("- Try using a different DNS server (8.8.8.8, 1.1.1.1)")
        logger.info("- Check if there are firewall restrictions")

    def _log_system_network_info(self) -> None:
        """Log system network configuration information.

        Logs the local hostname, the IPv4/IPv6 addresses it resolves to, and
        whether a TCP connection to port 53 of each server in
        ``_PUBLIC_DNS_SERVERS`` can be opened.
        """
        family_labels = {socket.AF_INET: "IPv4", socket.AF_INET6: "IPv6"}

        try:
            local_hostname = socket.gethostname()
            logger.info(f"Local hostname: {local_hostname}")

            try:
                local_ips = socket.getaddrinfo(local_hostname, None)
                logger.info("Local IP addresses:")
                for addr_info in local_ips:
                    family = family_labels.get(addr_info[0])
                    if family is not None:
                        logger.info(f" - {addr_info[4][0]} ({family})")
            except Exception as e:
                logger.warning(f"Could not retrieve local IP addresses: {e}")

            logger.info("DNS Server Connectivity:")
            for dns_server in self._PUBLIC_DNS_SERVERS:
                try:
                    # The context manager closes the socket even when
                    # settimeout/connect_ex raises, so no descriptor leaks.
                    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                        sock.settimeout(self._DNS_TIMEOUT_SECONDS)
                        result = sock.connect_ex((dns_server, 53))

                    if result == 0:
                        logger.info(f" ✓ Can reach {dns_server}:53")
                    else:
                        logger.error(f" ✗ Cannot reach {dns_server}:53")
                except Exception as e:
                    logger.error(f" ✗ Error testing {dns_server}:53 - {e}")

        except Exception as e:
            logger.warning(f"Could not gather system network info: {e}")

    def _test_alternative_dns(self, hostname: str) -> None:
        """Test hostname resolution using alternative methods.

        Resolves ``hostname`` with ``socket.getaddrinfo`` once for IPv4 and
        once for IPv6 and logs up to three addresses per family. Not invoked
        by ``perform_dns_probe``; kept as a standalone helper.
        """
        families = [(socket.AF_INET, "IPv4"), (socket.AF_INET6, "IPv6")]

        for family, family_name in families:
            try:
                result = socket.getaddrinfo(hostname, None, family)
                if result:
                    logger.info(f"✓ {family_name} resolution successful:")
                    for addr_info in result[:3]:
                        logger.info(f" - {addr_info[4][0]}")
                else:
                    logger.warning(f"✗ {family_name} resolution returned no results")
            except socket.gaierror:
                logger.error(f"✗ {family_name} resolution failed")
            except Exception as e:
                logger.error(f"✗ {family_name} resolution error: {e}")

    def get_workunits_internal(
        self,
    ) -> Iterable[MetadataWorkUnit]:
        """Run the configured probe (if any) and yield no work units."""
        if self.config.dns_probe_url is not None:
            # Perform DNS probe
            logger.info(f"Performing DNS probe for: {self.config.dns_probe_url}")
            self.perform_dns_probe(self.config.dns_probe_url)

        # Keeps this method a generator while producing nothing.
        yield from []

    def get_report(self) -> SourceReport:
        """Return the report object created in ``__init__``."""
        return self.report
@@ -13,8 +13,9 @@ from datahub.configuration.source_common import (
13
13
  )
14
14
  from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
15
15
  from datahub.ingestion.source.aws.s3_util import is_s3_uri
16
- from datahub.ingestion.source.state.stateful_ingestion_base import (
16
+ from datahub.ingestion.source.state.stale_entity_removal_handler import (
17
17
  StatefulIngestionConfigBase,
18
+ StatefulStaleMetadataRemovalConfig,
18
19
  )
19
20
 
20
21
  # hide annoying debug errors from py4j
@@ -39,9 +40,7 @@ class S3(ConfigModel):
39
40
 
40
41
 
41
42
  class DeltaLakeSourceConfig(
42
- PlatformInstanceConfigMixin,
43
- EnvConfigMixin,
44
- StatefulIngestionConfigBase,
43
+ PlatformInstanceConfigMixin, EnvConfigMixin, StatefulIngestionConfigBase
45
44
  ):
46
45
  base_path: str = Field(
47
46
  description="Path to table (s3 or local file system). If path is not a delta table path "
@@ -78,7 +77,12 @@ class DeltaLakeSourceConfig(
78
77
  "When set to `False`, number_of_files in delta table can not be reported.",
79
78
  )
80
79
 
81
- s3: Optional[S3] = Field()
80
+ s3: Optional[S3] = Field(None)
81
+
82
+ stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
83
+ default=None,
84
+ description="Stateful Ingestion Config with stale metadata removal",
85
+ )
82
86
 
83
87
  @cached_property
84
88
  def is_s3(self):
@@ -29,6 +29,7 @@ from datahub.ingestion.source.aws.s3_util import (
29
29
  get_key_prefix,
30
30
  strip_s3_prefix,
31
31
  )
32
+ from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier
32
33
  from datahub.ingestion.source.data_lake_common.data_lake_utils import ContainerWUCreator
33
34
  from datahub.ingestion.source.delta_lake.config import DeltaLakeSourceConfig
34
35
  from datahub.ingestion.source.delta_lake.delta_lake_utils import (
@@ -85,6 +86,13 @@ OPERATION_STATEMENT_TYPES = {
85
86
  @config_class(DeltaLakeSourceConfig)
86
87
  @support_status(SupportStatus.INCUBATING)
87
88
  @capability(SourceCapability.TAGS, "Can extract S3 object/bucket tags if enabled")
89
+ @capability(
90
+ SourceCapability.CONTAINERS,
91
+ "Enabled by default",
92
+ subtype_modifier=[
93
+ SourceCapabilityModifier.FOLDER,
94
+ ],
95
+ )
88
96
  class DeltaLakeSource(StatefulIngestionSourceBase):
89
97
  """
90
98
  This plugin extracts:
@@ -7,7 +7,7 @@ from collections import defaultdict
7
7
  from enum import Enum
8
8
  from itertools import product
9
9
  from time import sleep, time
10
- from typing import Any, Deque, Dict, List, Optional, Union
10
+ from typing import TYPE_CHECKING, Any, Deque, Dict, List, Optional, Union
11
11
  from urllib.parse import quote
12
12
 
13
13
  import requests
@@ -15,12 +15,17 @@ from requests.adapters import HTTPAdapter
15
15
  from urllib3 import Retry
16
16
  from urllib3.exceptions import InsecureRequestWarning
17
17
 
18
+ from datahub.emitter.request_helper import make_curl_command
18
19
  from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
19
20
  from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
20
21
  DremioToDataHubSourceTypeMapping,
21
22
  )
22
23
  from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
23
24
  from datahub.ingestion.source.dremio.dremio_sql_queries import DremioSQLQueries
25
+ from datahub.utilities.perf_timer import PerfTimer
26
+
27
+ if TYPE_CHECKING:
28
+ from datahub.ingestion.source.dremio.dremio_entities import DremioContainer
24
29
 
25
30
  logger = logging.getLogger(__name__)
26
31
 
@@ -54,6 +59,8 @@ class DremioAPIOperations:
54
59
  self.deny_schema_pattern: List[str] = connection_args.schema_pattern.deny
55
60
  self._max_workers: int = connection_args.max_workers
56
61
  self.is_dremio_cloud = connection_args.is_dremio_cloud
62
+ self.start_time = connection_args.start_time
63
+ self.end_time = connection_args.end_time
57
64
  self.report = report
58
65
  self.session = requests.Session()
59
66
  if connection_args.is_dremio_cloud:
@@ -178,6 +185,7 @@ class DremioAPIOperations:
178
185
  self.session.headers.update(
179
186
  {"Authorization": f"Bearer {connection_args.password}"}
180
187
  )
188
+ logger.debug("Configured Dremio cloud API session to use PAT")
181
189
  return
182
190
 
183
191
  # On-prem Dremio authentication (PAT or Basic Auth)
@@ -189,6 +197,7 @@ class DremioAPIOperations:
189
197
  "Authorization": f"Bearer {connection_args.password}",
190
198
  }
191
199
  )
200
+ logger.debug("Configured Dremio API session to use PAT")
192
201
  return
193
202
  else:
194
203
  assert connection_args.username and connection_args.password, (
@@ -212,10 +221,10 @@ class DremioAPIOperations:
212
221
  response.raise_for_status()
213
222
  token = response.json().get("token")
214
223
  if token:
224
+ logger.debug("Exchanged username and password for Dremio token")
215
225
  self.session.headers.update(
216
226
  {"Authorization": f"_dremio{token}"}
217
227
  )
218
-
219
228
  return
220
229
  else:
221
230
  self.report.failure("Failed to authenticate", login_url)
@@ -231,49 +240,76 @@ class DremioAPIOperations:
231
240
  "Credentials cannot be refreshed. Please check your username and password."
232
241
  )
233
242
 
243
def _request(self, method: str, url: str, data: Union[str, None] = None) -> Dict:
    """Send a request to the Dremio API and return the decoded JSON body.

    The call is counted in ``self.report`` (total calls, calls per
    "METHOD path", and elapsed seconds per "METHOD path") before the
    response body is parsed.

    Args:
        method: HTTP method name, e.g. ``"GET"`` or ``"POST"``.
        url: Path appended to ``self.base_url``.
        data: Optional request body.

    Returns:
        The response body parsed as JSON.

    Raises:
        DremioAPIException: If the response body is not valid JSON.
    """
    full_url = self.base_url + url
    call_key = f"{method} {url}"

    logger.debug(f"{method} request to {full_url}")
    self.report.api_calls_total += 1
    self.report.api_calls_by_method_and_path[call_key] += 1

    with PerfTimer() as timer:
        response = self.session.request(
            method=method,
            url=full_url,
            data=data,
            verify=self._verify,
            timeout=self._timeout,
        )
        self.report.api_call_secs_by_method_and_path[call_key] += (
            timer.elapsed_seconds()
        )

    # NOTE: HTTP error statuses are deliberately not raised here; enabling
    # response.raise_for_status() makes the integration tests fail.
    try:
        return response.json()
    except requests.exceptions.JSONDecodeError as e:
        logger.info(
            f"On {method} request to {url}, failed to parse JSON from response (status {response.status_code}): {response.text}"
        )
        # Use the full URL so the logged curl command can actually be replayed.
        logger.debug(
            f"Request curl equivalent: {make_curl_command(self.session, method, full_url, data)}"
        )
        raise DremioAPIException(
            f"Failed to parse JSON from response (status {response.status_code}): {response.text}"
        ) from e
274
+
234
275
  def get(self, url: str) -> Dict:
235
- """execute a get request on dremio"""
236
- response = self.session.get(
237
- url=(self.base_url + url),
238
- verify=self._verify,
239
- timeout=self._timeout,
240
- )
241
- return response.json()
276
+ """Send a GET request to the Dremio API."""
277
+ return self._request("GET", url)
242
278
 
243
279
  def post(self, url: str, data: str) -> Dict:
244
- """execute a get request on dremio"""
245
- response = self.session.post(
246
- url=(self.base_url + url),
247
- data=data,
248
- verify=self._verify,
249
- timeout=self._timeout,
250
- )
251
- return response.json()
280
+ """Send a POST request to the Dremio API."""
281
+ return self._request("POST", url, data=data)
252
282
 
253
283
  def execute_query(self, query: str, timeout: int = 3600) -> List[Dict[str, Any]]:
254
284
  """Execute SQL query with timeout and error handling"""
255
285
  try:
256
- response = self.post(url="/sql", data=json.dumps({"sql": query}))
286
+ with PerfTimer() as timer:
287
+ logger.info(f"Executing query: {query}")
288
+ response = self.post(url="/sql", data=json.dumps({"sql": query}))
257
289
 
258
- if "errorMessage" in response:
259
- self.report.failure(
260
- message="SQL Error", context=f"{response['errorMessage']}"
261
- )
262
- raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
290
+ if "errorMessage" in response:
291
+ self.report.failure(
292
+ message="SQL Error", context=f"{response['errorMessage']}"
293
+ )
294
+ raise DremioAPIException(f"SQL Error: {response['errorMessage']}")
263
295
 
264
- job_id = response["id"]
296
+ job_id = response["id"]
265
297
 
266
- with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
267
- future = executor.submit(self.fetch_results, job_id)
268
- try:
269
- return future.result(timeout=timeout)
270
- except concurrent.futures.TimeoutError:
271
- self.cancel_query(job_id)
272
- raise DremioAPIException(
273
- f"Query execution timed out after {timeout} seconds"
274
- ) from None
275
- except RuntimeError as e:
276
- raise DremioAPIException() from e
298
+ with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
299
+ future = executor.submit(self.fetch_results, job_id)
300
+ try:
301
+ result = future.result(timeout=timeout)
302
+ logger.info(
303
+ f"Query executed in {timer.elapsed_seconds()} seconds with {len(result)} results"
304
+ )
305
+ return result
306
+ except concurrent.futures.TimeoutError:
307
+ self.cancel_query(job_id)
308
+ raise DremioAPIException(
309
+ f"Query execution timed out after {timeout} seconds"
310
+ ) from None
311
+ except RuntimeError as e:
312
+ raise DremioAPIException() from e
277
313
 
278
314
  except requests.RequestException as e:
279
315
  raise DremioAPIException("Error executing query") from e
@@ -462,7 +498,9 @@ class DremioAPIOperations:
462
498
  pattern_str = "|".join(f"({p})" for p in patterns)
463
499
  return f"AND {operator}({field}, '{pattern_str}')"
464
500
 
465
- def get_all_tables_and_columns(self, containers: Deque) -> List[Dict]:
501
+ def get_all_tables_and_columns(
502
+ self, containers: Deque["DremioContainer"]
503
+ ) -> List[Dict]:
466
504
  if self.edition == DremioEdition.ENTERPRISE:
467
505
  query_template = DremioSQLQueries.QUERY_DATASETS_EE
468
506
  elif self.edition == DremioEdition.CLOUD:
@@ -603,10 +641,25 @@ class DremioAPIOperations:
603
641
  return parents_list
604
642
 
605
643
  def extract_all_queries(self) -> List[Dict[str, Any]]:
644
+ # Convert datetime objects to string format for SQL queries
645
+ start_timestamp_str = None
646
+ end_timestamp_str = None
647
+
648
+ if self.start_time:
649
+ start_timestamp_str = self.start_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
650
+ if self.end_time:
651
+ end_timestamp_str = self.end_time.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
652
+
606
653
  if self.edition == DremioEdition.CLOUD:
607
- jobs_query = DremioSQLQueries.QUERY_ALL_JOBS_CLOUD
654
+ jobs_query = DremioSQLQueries.get_query_all_jobs_cloud(
655
+ start_timestamp_millis=start_timestamp_str,
656
+ end_timestamp_millis=end_timestamp_str,
657
+ )
608
658
  else:
609
- jobs_query = DremioSQLQueries.QUERY_ALL_JOBS
659
+ jobs_query = DremioSQLQueries.get_query_all_jobs(
660
+ start_timestamp_millis=start_timestamp_str,
661
+ end_timestamp_millis=end_timestamp_str,
662
+ )
610
663
 
611
664
  return self.execute_query(query=jobs_query)
612
665
 
@@ -685,6 +738,27 @@ class DremioAPIOperations:
685
738
 
686
739
  return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)
687
740
 
741
def _could_match_pattern(self, pattern: str, path_components: List[str]) -> bool:
    """Check whether a container path could potentially match a schema pattern.

    This handles hierarchical path matching for container filtering: a
    container is kept when its dotted path either lies under the pattern's
    literal prefix or is itself a leading part of that prefix.

    Args:
        pattern: Schema pattern to test against.
        path_components: Path of the container, one element per level.

    Returns:
        True if the container could match ``pattern``, False otherwise.
    """
    if pattern == ".*":
        return True

    current_path = ".".join(path_components)

    # Handle simple .* patterns (like "a.b.c.*"): a literal prefix followed by
    # a trailing ".*".
    if pattern.endswith(".*"):
        pattern_prefix = pattern[:-2]  # Remove ".*"
        # '*' and '|' inside the prefix are regex operators too (e.g.
        # "a.*.c.*" or "a|b.*"); such patterns must not be compared as
        # literal text, so they fall through to the regex matcher below.
        if not any(c in pattern_prefix for c in "^$[](){}+?\\*|"):
            prefix_lower = pattern_prefix.lower()
            path_lower = current_path.lower()
            return path_lower.startswith(prefix_lower) or prefix_lower.startswith(
                path_lower
            )

    # Complex regex pattern - use existing regex matching logic
    return self._check_pattern_match(pattern, [current_path], allow_prefix=True)
761
+
688
762
  def should_include_container(self, path: List[str], name: str) -> bool:
689
763
  """
690
764
  Helper method to check if a container should be included based on schema patterns.
@@ -711,41 +785,8 @@ class DremioAPIOperations:
711
785
 
712
786
  # Check allow patterns
713
787
  for pattern in self.allow_schema_pattern:
714
- # For patterns with wildcards, check if this path is a parent of the pattern
715
- if "*" in pattern:
716
- pattern_parts = pattern.split(".")
717
- path_parts = path_components
718
-
719
- # If pattern has exact same number of parts, check each component
720
- if len(pattern_parts) == len(path_parts):
721
- matches = True
722
- for p_part, c_part in zip(pattern_parts, path_parts):
723
- if p_part != "*" and p_part.lower() != c_part.lower():
724
- matches = False
725
- break
726
- if matches:
727
- self.report.report_container_scanned(full_path)
728
- return True
729
- # Otherwise check if current path is prefix match
730
- else:
731
- # Remove the trailing wildcard if present
732
- if pattern_parts[-1] == "*":
733
- pattern_parts = pattern_parts[:-1]
734
-
735
- for i in range(len(path_parts)):
736
- current_path = ".".join(path_parts[: i + 1])
737
- pattern_prefix = ".".join(pattern_parts[: i + 1])
738
-
739
- if pattern_prefix.startswith(current_path):
740
- self.report.report_container_scanned(full_path)
741
- return True
742
-
743
- # Direct pattern matching
744
- if self._check_pattern_match(
745
- pattern=pattern,
746
- paths=[full_path],
747
- allow_prefix=True,
748
- ):
788
+ # Check if current path could potentially match this pattern
789
+ if self._could_match_pattern(pattern, path_components):
749
790
  self.report.report_container_scanned(full_path)
750
791
  return True
751
792
 
@@ -14,6 +14,7 @@ from datahub.emitter.mce_builder import (
14
14
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
15
15
  from datahub.emitter.mcp_builder import ContainerKey
16
16
  from datahub.ingestion.api.workunit import MetadataWorkUnit
17
+ from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes
17
18
  from datahub.ingestion.source.dremio.dremio_entities import (
18
19
  DremioContainer,
19
20
  DremioDataset,
@@ -364,9 +365,9 @@ class DremioAspects:
364
365
  ) -> Optional[BrowsePathsV2Class]:
365
366
  paths = []
366
367
 
367
- if entity.subclass == "Dremio Space":
368
+ if entity.subclass == DatasetContainerSubTypes.DREMIO_SPACE.value:
368
369
  paths.append(BrowsePathEntryClass(id="Spaces"))
369
- elif entity.subclass == "Dremio Source":
370
+ elif entity.subclass == DatasetContainerSubTypes.DREMIO_SOURCE.value:
370
371
  paths.append(BrowsePathEntryClass(id="Sources"))
371
372
  if paths:
372
373
  return BrowsePathsV2Class(path=paths)