acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -7,13 +7,15 @@ from typing import Any, Callable, Optional, Tuple, TypeVar
7
7
 
8
8
  import click
9
9
  import humanfriendly
10
- from packaging.version import Version
10
+ from packaging.version import InvalidVersion, Version
11
11
  from pydantic import BaseModel
12
12
 
13
13
  from datahub._version import __version__
14
14
  from datahub.cli.config_utils import load_client_config
15
15
  from datahub.ingestion.graph.client import DataHubGraph
16
+ from datahub.ingestion.graph.config import ClientMode
16
17
  from datahub.utilities.perf_timer import PerfTimer
18
+ from datahub.utilities.server_config_util import RestServiceConfig
17
19
 
18
20
  log = logging.getLogger(__name__)
19
21
 
@@ -26,10 +28,24 @@ class VersionStats(BaseModel, arbitrary_types_allowed=True):
26
28
  release_date: Optional[datetime] = None
27
29
 
28
30
 
31
+ def _safe_version_stats(version_string: str) -> Optional[VersionStats]:
32
+ """
33
+ Safely create a VersionStats object from a version string.
34
+ Returns None if the version string is invalid.
35
+ """
36
+ try:
37
+ return VersionStats(version=Version(version_string), release_date=None)
38
+ except InvalidVersion:
39
+ log.warning(f"Invalid version format received: {version_string!r}")
40
+ return None
41
+
42
+
29
43
  class ServerVersionStats(BaseModel):
30
44
  current: VersionStats
31
45
  latest: Optional[VersionStats] = None
32
46
  current_server_type: Optional[str] = None
47
+ current_server_default_cli_version: Optional[VersionStats] = None
48
+ is_cloud_server: Optional[bool] = None
33
49
 
34
50
 
35
51
  class ClientVersionStats(BaseModel):
@@ -42,7 +58,7 @@ class DataHubVersionStats(BaseModel):
42
58
  client: ClientVersionStats
43
59
 
44
60
 
45
- async def get_client_version_stats():
61
+ async def get_client_version_stats() -> ClientVersionStats:
46
62
  import aiohttp
47
63
 
48
64
  current_version_string = __version__
@@ -50,6 +66,7 @@ async def get_client_version_stats():
50
66
  client_version_stats: ClientVersionStats = ClientVersionStats(
51
67
  current=VersionStats(version=current_version, release_date=None), latest=None
52
68
  )
69
+
53
70
  async with aiohttp.ClientSession() as session:
54
71
  pypi_url = "https://pypi.org/pypi/acryl_datahub/json"
55
72
  async with session.get(pypi_url) as resp:
@@ -109,7 +126,7 @@ async def get_github_stats():
109
126
  return (latest_server_version, latest_server_date)
110
127
 
111
128
 
112
- async def get_server_config(gms_url: str, token: Optional[str]) -> dict:
129
+ async def get_server_config(gms_url: str, token: Optional[str]) -> RestServiceConfig:
113
130
  import aiohttp
114
131
 
115
132
  headers = {
@@ -124,19 +141,22 @@ async def get_server_config(gms_url: str, token: Optional[str]) -> dict:
124
141
  config_endpoint = f"{gms_url}/config"
125
142
  async with session.get(config_endpoint, headers=headers) as dh_response:
126
143
  dh_response_json = await dh_response.json()
127
- return dh_response_json
144
+ return RestServiceConfig(raw_config=dh_response_json)
128
145
 
129
146
 
130
147
  async def get_server_version_stats(
131
148
  server: Optional[DataHubGraph] = None,
132
- ) -> Tuple[Optional[str], Optional[Version], Optional[datetime]]:
149
+ ) -> Tuple[
150
+ Optional[str], Optional[Version], Optional[str], Optional[datetime], Optional[bool]
151
+ ]:
133
152
  import aiohttp
134
153
 
135
- server_config = None
154
+ server_config: Optional[RestServiceConfig] = None
136
155
  if not server:
137
156
  try:
138
157
  # let's get the server from the cli config
139
158
  client_config = load_client_config()
159
+ client_config.client_mode = ClientMode.CLI
140
160
  host = client_config.server
141
161
  token = client_config.token
142
162
  server_config = await get_server_config(host, token)
@@ -148,17 +168,15 @@ async def get_server_version_stats(
148
168
 
149
169
  server_type = None
150
170
  server_version: Optional[Version] = None
171
+ current_server_default_cli_version = None
151
172
  current_server_release_date = None
173
+ is_cloud_server: Optional[bool] = None
152
174
  if server_config:
153
- server_version_string = (
154
- server_config.get("versions", {})
155
- .get("acryldata/datahub", {})
156
- .get("version")
157
- )
158
- commit_hash = (
159
- server_config.get("versions", {}).get("acryldata/datahub", {}).get("commit")
160
- )
161
- server_type = server_config.get("datahub", {}).get("serverType", "unknown")
175
+ server_version_string = server_config.service_version
176
+ commit_hash = server_config.commit_hash
177
+ server_type = server_config.server_type
178
+ current_server_default_cli_version = server_config.default_cli_version
179
+ is_cloud_server = server_config.is_datahub_cloud
162
180
  if server_type == "quickstart" and commit_hash:
163
181
  async with aiohttp.ClientSession(
164
182
  headers={"Accept": "application/vnd.github.v3+json"}
@@ -173,7 +191,13 @@ async def get_server_version_stats(
173
191
  if server_version_string and server_version_string.startswith("v"):
174
192
  server_version = Version(server_version_string[1:])
175
193
 
176
- return (server_type, server_version, current_server_release_date)
194
+ return (
195
+ server_type,
196
+ server_version,
197
+ current_server_default_cli_version,
198
+ current_server_release_date,
199
+ is_cloud_server,
200
+ )
177
201
 
178
202
 
179
203
  def retrieve_version_stats(
@@ -216,7 +240,9 @@ async def _retrieve_version_stats(
216
240
  (
217
241
  current_server_type,
218
242
  current_server_version,
243
+ current_server_default_cli_version,
219
244
  current_server_release_date,
245
+ is_cloud_server,
220
246
  ) = results[2]
221
247
 
222
248
  server_version_stats = None
@@ -225,12 +251,18 @@ async def _retrieve_version_stats(
225
251
  current=VersionStats(
226
252
  version=current_server_version, release_date=current_server_release_date
227
253
  ),
254
+ current_server_default_cli_version=(
255
+ _safe_version_stats(current_server_default_cli_version)
256
+ if current_server_default_cli_version
257
+ else None
258
+ ),
228
259
  latest=(
229
260
  VersionStats(version=last_server_version, release_date=last_server_date)
230
261
  if last_server_version
231
262
  else None
232
263
  ),
233
264
  current_server_type=current_server_type,
265
+ is_cloud_server=is_cloud_server,
234
266
  )
235
267
 
236
268
  if client_version_stats and server_version_stats:
@@ -257,21 +289,14 @@ def valid_client_version(version: Version) -> bool:
257
289
  """Only version strings like 0.4.5 and 0.6.7.8 are valid. 0.8.6.7rc1 is not"""
258
290
  if version.is_prerelease or version.is_postrelease or version.is_devrelease:
259
291
  return False
260
- if version.major == 0 and version.minor in [8, 9, 10, 11]:
261
- return True
262
-
263
- return False
292
+ return True
264
293
 
265
294
 
266
295
  def valid_server_version(version: Version) -> bool:
267
296
  """Only version strings like 0.8.x, 0.9.x or 0.10.x are valid. 0.1.x is not"""
268
297
  if version.is_prerelease or version.is_postrelease or version.is_devrelease:
269
298
  return False
270
-
271
- if version.major == 0 and version.minor in [8, 9, 10]:
272
- return True
273
-
274
- return False
299
+ return True
275
300
 
276
301
 
277
302
  def is_client_server_compatible(client: VersionStats, server: VersionStats) -> int:
@@ -293,6 +318,27 @@ def is_client_server_compatible(client: VersionStats, server: VersionStats) -> i
293
318
  return server.version.micro - client.version.micro
294
319
 
295
320
 
321
+ def is_server_default_cli_ahead(version_stats: DataHubVersionStats) -> bool:
322
+ """
323
+ Check if the server default CLI version is ahead of the current CLI version.
324
+ Returns True if server default CLI is newer and both versions are valid.
325
+ """
326
+ if not version_stats.server.current_server_default_cli_version:
327
+ return False
328
+
329
+ current_cli = version_stats.client.current
330
+ server_default_cli = version_stats.server.current_server_default_cli_version
331
+
332
+ is_valid_client_version = valid_client_version(current_cli.version)
333
+ is_valid_server_version = valid_client_version(server_default_cli.version)
334
+
335
+ if not (is_valid_client_version and is_valid_server_version):
336
+ return False
337
+
338
+ compatibility_result = is_client_server_compatible(current_cli, server_default_cli)
339
+ return compatibility_result > 0
340
+
341
+
296
342
  def _maybe_print_upgrade_message(
297
343
  version_stats: Optional[DataHubVersionStats],
298
344
  ) -> None:
@@ -314,9 +360,15 @@ def _maybe_print_upgrade_message(
314
360
  if version_stats.client.latest
315
361
  else None
316
362
  )
317
- client_server_compat = is_client_server_compatible(
318
- version_stats.client.current, version_stats.server.current
319
- )
363
+ client_server_compat = 0
364
+ # Skip version compatibility checks for cloud servers (serverEnv="cloud")
365
+ # Cloud servers use different versioning schemes between server and CLI
366
+ is_cloud = version_stats.server.is_cloud_server
367
+
368
+ if not is_cloud:
369
+ client_server_compat = is_client_server_compatible(
370
+ version_stats.client.current, version_stats.server.current
371
+ )
320
372
 
321
373
  if latest_release_date and current_release_date:
322
374
  assert version_stats.client.latest
@@ -379,7 +431,8 @@ def _maybe_print_upgrade_message(
379
431
  + click.style(
380
432
  f"➡️ Upgrade via \"pip install 'acryl-datahub=={version_stats.server.current.version}'\"",
381
433
  fg="cyan",
382
- )
434
+ ),
435
+ err=True,
383
436
  )
384
437
  elif client_server_compat == 0 and encourage_cli_upgrade:
385
438
  with contextlib.suppress(Exception):
@@ -389,7 +442,8 @@ def _maybe_print_upgrade_message(
389
442
  + click.style(
390
443
  f"You seem to be running an old version of datahub cli: {current_version} {get_days(current_release_date)}. Latest version is {latest_version} {get_days(latest_release_date)}.\nUpgrade via \"pip install -U 'acryl-datahub'\"",
391
444
  fg="cyan",
392
- )
445
+ ),
446
+ err=True,
393
447
  )
394
448
  elif encourage_quickstart_upgrade:
395
449
  try:
@@ -429,6 +483,8 @@ def check_upgrade_post(
429
483
 
430
484
 
431
485
  def check_upgrade(func: Callable[..., T]) -> Callable[..., T]:
486
+ log.debug(f"Checking upgrade for {func.__module__}.{func.__name__}")
487
+
432
488
  @wraps(func)
433
489
  def async_wrapper(*args: Any, **kwargs: Any) -> Any:
434
490
  with PerfTimer() as timer:
@@ -1,7 +1,6 @@
1
1
  import collections
2
2
  import gzip
3
3
  import logging
4
- import os
5
4
  import pathlib
6
5
  import pickle
7
6
  import shutil
@@ -28,18 +27,18 @@ from typing import (
28
27
  Union,
29
28
  )
30
29
 
30
+ from datahub.configuration.env_vars import get_override_sqlite_version_req
31
31
  from datahub.ingestion.api.closeable import Closeable
32
32
  from datahub.utilities.sentinels import Unset, unset
33
33
 
34
34
  logger: logging.Logger = logging.getLogger(__name__)
35
35
 
36
- OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR = (
37
- os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
38
- )
39
- OVERRIDE_SQLITE_VERSION_REQUIREMENT = (
40
- OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR
41
- and OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR.lower() != "false"
42
- )
36
+
37
+ def _get_sqlite_version_override() -> bool:
38
+ """Check if SQLite version requirement should be overridden at runtime."""
39
+ override_str = get_override_sqlite_version_req()
40
+ return bool(override_str and override_str.lower() != "false")
41
+
43
42
 
44
43
  _DEFAULT_FILE_NAME = "sqlite.db"
45
44
  _DEFAULT_TABLE_NAME = "data"
@@ -231,7 +230,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
231
230
  # We use the ON CONFLICT clause to implement UPSERTs with sqlite.
232
231
  # This was added in 3.24.0 from 2018-06-04.
233
232
  # See https://www.sqlite.org/lang_conflict.html
234
- if OVERRIDE_SQLITE_VERSION_REQUIREMENT:
233
+ if _get_sqlite_version_override():
235
234
  self._use_sqlite_on_conflict = False
236
235
  else:
237
236
  raise RuntimeError("SQLite version 3.24.0 or later is required")
@@ -250,7 +249,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
250
249
  rowid INTEGER PRIMARY KEY AUTOINCREMENT,
251
250
  key TEXT UNIQUE,
252
251
  value BLOB
253
- {"".join(f", {column_name} BLOB" for column_name in self.extra_columns.keys())}
252
+ {"".join(f", {column_name} BLOB" for column_name in self.extra_columns)}
254
253
  )"""
255
254
  )
256
255
 
@@ -267,7 +266,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
267
266
  if self.indexes_created:
268
267
  return
269
268
  # The key column will automatically be indexed, but we need indexes for the extra columns.
270
- for column_name in self.extra_columns.keys():
269
+ for column_name in self.extra_columns:
271
270
  self._conn.execute(
272
271
  f"CREATE INDEX {self.tablename}_{column_name} ON {self.tablename} ({column_name})"
273
272
  )
@@ -305,12 +304,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
305
304
  f"""INSERT INTO {self.tablename} (
306
305
  key,
307
306
  value
308
- {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
307
+ {"".join(f", {column_name}" for column_name in self.extra_columns)}
309
308
  )
310
309
  VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})
311
310
  ON CONFLICT (key) DO UPDATE SET
312
311
  value = excluded.value
313
- {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns.keys())}
312
+ {"".join(f", {column_name} = excluded.{column_name}" for column_name in self.extra_columns)}
314
313
  """,
315
314
  items_to_write,
316
315
  )
@@ -321,7 +320,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
321
320
  f"""INSERT INTO {self.tablename} (
322
321
  key,
323
322
  value
324
- {"".join(f", {column_name}" for column_name in self.extra_columns.keys())}
323
+ {"".join(f", {column_name}" for column_name in self.extra_columns)}
325
324
  )
326
325
  VALUES ({", ".join(["?"] * (2 + len(self.extra_columns)))})""",
327
326
  item,
@@ -330,7 +329,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
330
329
  self._conn.execute(
331
330
  f"""UPDATE {self.tablename} SET
332
331
  value = ?
333
- {"".join(f", {column_name} = ?" for column_name in self.extra_columns.keys())}
332
+ {"".join(f", {column_name} = ?" for column_name in self.extra_columns)}
334
333
  WHERE key = ?""",
335
334
  (*item[1:], item[0]),
336
335
  )
@@ -155,7 +155,7 @@ class HiveColumnToAvroConverter:
155
155
 
156
156
  @staticmethod
157
157
  def _parse_basic_datatype_string(s: str) -> Dict[str, object]:
158
- if s in HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE.keys():
158
+ if s in HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE:
159
159
  return {
160
160
  "type": HiveColumnToAvroConverter._PRIVIMITE_HIVE_TYPE_TO_AVRO_TYPE[s],
161
161
  "native_data_type": s,
@@ -218,7 +218,7 @@ class HiveColumnToAvroConverter:
218
218
  buf = ""
219
219
  level = 0
220
220
  for c in s:
221
- if c in HiveColumnToAvroConverter._BRACKETS.keys():
221
+ if c in HiveColumnToAvroConverter._BRACKETS:
222
222
  level += 1
223
223
  buf += c
224
224
  elif c in HiveColumnToAvroConverter._BRACKETS.values():
@@ -32,10 +32,10 @@ def deploy_source_vars(
32
32
  name: Optional[str],
33
33
  config: str,
34
34
  urn: Optional[str],
35
- executor_id: str,
35
+ executor_id: Optional[str],
36
36
  cli_version: Optional[str],
37
37
  schedule: Optional[str],
38
- time_zone: str,
38
+ time_zone: Optional[str],
39
39
  extra_pip: Optional[str],
40
40
  debug: bool = False,
41
41
  ) -> dict:
@@ -1,6 +1,7 @@
1
- import os
2
1
  import sys
3
2
 
3
+ from datahub.configuration.env_vars import get_test_mode
4
+
4
5
 
5
6
  def is_pytest_running() -> bool:
6
- return "pytest" in sys.modules and os.environ.get("DATAHUB_TEST_MODE") == "1"
7
+ return "pytest" in sys.modules and get_test_mode() == "1"
@@ -15,13 +15,13 @@ import collections
15
15
  import contextlib
16
16
  import itertools
17
17
  import logging
18
- import os
19
18
  import pathlib
20
19
  import sys
21
20
  from typing import Deque, Iterator, Optional
22
21
 
23
22
  import click
24
23
 
24
+ from datahub.configuration.env_vars import get_no_color, get_suppress_logging_manager
25
25
  from datahub.utilities.tee_io import TeeIO
26
26
 
27
27
  BASE_LOGGING_FORMAT = (
@@ -38,7 +38,7 @@ IN_MEMORY_LOG_BUFFER_SIZE = 2000 # lines
38
38
  IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH = 2000 # characters
39
39
 
40
40
 
41
- NO_COLOR = os.environ.get("NO_COLOR", False)
41
+ NO_COLOR = get_no_color()
42
42
 
43
43
 
44
44
  def extract_name_from_filename(filename: str, fallback_name: str) -> str:
@@ -161,6 +161,7 @@ class _LogBuffer:
161
161
  self._buffer: Deque[str] = collections.deque(maxlen=maxlen)
162
162
 
163
163
  def write(self, line: str) -> None:
164
+ # We do not expect `line` to have a trailing newline.
164
165
  if len(line) > IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH:
165
166
  line = line[:IN_MEMORY_LOG_BUFFER_MAX_LINE_LENGTH] + "[truncated]"
166
167
 
@@ -178,6 +179,18 @@ class _LogBuffer:
178
179
  return text
179
180
 
180
181
 
182
+ class _ResilientStreamHandler(logging.StreamHandler):
183
+ """StreamHandler that gracefully handles closed streams."""
184
+
185
+ def emit(self, record: logging.LogRecord) -> None:
186
+ try:
187
+ super().emit(record)
188
+ except (ValueError, OSError):
189
+ # Stream was closed (e.g., during pytest teardown)
190
+ # Silently ignore to prevent test failures
191
+ pass
192
+
193
+
181
194
  class _BufferLogHandler(logging.Handler):
182
195
  def __init__(self, storage: _LogBuffer) -> None:
183
196
  super().__init__()
@@ -188,13 +201,23 @@ class _BufferLogHandler(logging.Handler):
188
201
  message = self.format(record)
189
202
  except TypeError as e:
190
203
  message = f"Error formatting log message: {e}\nMessage: {record.msg}, Args: {record.args}"
191
- self._storage.write(message)
204
+
205
+ # For exception stack traces, the message is split over multiple lines,
206
+ # but we store it as a single string. Because we truncate based on line
207
+ # length, it's better for us to split it into multiple lines so that we
208
+ # don't lose any information on deeper stack traces.
209
+ for line in message.split("\n"):
210
+ self._storage.write(line)
192
211
 
193
212
 
194
213
  def _remove_all_handlers(logger: logging.Logger) -> None:
195
214
  for handler in logger.handlers[:]:
196
215
  logger.removeHandler(handler)
197
- handler.close()
216
+ try:
217
+ handler.close()
218
+ except (ValueError, OSError):
219
+ # Handler stream may already be closed (e.g., during pytest teardown)
220
+ pass
198
221
 
199
222
 
200
223
  _log_buffer = _LogBuffer(maxlen=IN_MEMORY_LOG_BUFFER_SIZE)
@@ -212,14 +235,14 @@ _default_formatter = logging.Formatter(BASE_LOGGING_FORMAT)
212
235
  def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[None]:
213
236
  _log_buffer.clear()
214
237
 
215
- if os.environ.get("DATAHUB_SUPPRESS_LOGGING_MANAGER") == "1":
238
+ if get_suppress_logging_manager() == "1":
216
239
  # If we're running in pytest, we don't want to configure logging.
217
240
  yield
218
241
  return
219
242
 
220
243
  with contextlib.ExitStack() as stack:
221
244
  # Create stdout handler.
222
- stream_handler = logging.StreamHandler()
245
+ stream_handler = _ResilientStreamHandler()
223
246
  stream_handler.addFilter(_DatahubLogFilter(debug=debug))
224
247
  stream_handler.setFormatter(_stream_formatter)
225
248
 
@@ -230,7 +253,7 @@ def configure_logging(debug: bool, log_file: Optional[str] = None) -> Iterator[N
230
253
  tee = TeeIO(sys.stdout, file)
231
254
  stack.enter_context(contextlib.redirect_stdout(tee)) # type: ignore
232
255
 
233
- file_handler = logging.StreamHandler(file)
256
+ file_handler = _ResilientStreamHandler(file)
234
257
  file_handler.addFilter(_DatahubLogFilter(debug=True))
235
258
  file_handler.setFormatter(_default_formatter)
236
259
  else:
@@ -83,7 +83,7 @@ class Constants:
83
83
  MATCH = "match"
84
84
  USER_OWNER = "user"
85
85
  GROUP_OWNER = "group"
86
- OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float]
86
+ OPERAND_DATATYPE_SUPPORTED = [int, bool, str, float, list]
87
87
  TAG_PARTITION_KEY = "PARTITION_KEY"
88
88
  TAG_DIST_KEY = "DIST_KEY"
89
89
  TAG_SORT_KEY = "SORT_KEY"
@@ -455,7 +455,34 @@ class OperationProcessor:
455
455
  # function to check if a match clause is satisfied to a value.
456
456
  if not any(
457
457
  isinstance(raw_props_value, t) for t in Constants.OPERAND_DATATYPE_SUPPORTED
458
- ) or not isinstance(raw_props_value, type(match_clause)):
458
+ ):
459
+ return None
460
+
461
+ # Handle list values by checking if any item in the list matches
462
+ if isinstance(raw_props_value, list):
463
+ # For lists, we need to find at least one matching item
464
+ # Return a match with the concatenated values of all matching items
465
+ matching_items = []
466
+ for item in raw_props_value:
467
+ if isinstance(item, str):
468
+ match = re.match(match_clause, item)
469
+ if match:
470
+ matching_items.append(item)
471
+ elif isinstance(match_clause, type(item)):
472
+ match = re.match(str(match_clause), str(item))
473
+ if match:
474
+ matching_items.append(str(item))
475
+
476
+ if matching_items:
477
+ # Create a synthetic match object with all matching items joined
478
+ combined_value = ",".join(matching_items)
479
+ return re.match(
480
+ ".*", combined_value
481
+ ) # Always matches, returns combined value
482
+ return None
483
+
484
+ # Handle scalar values (existing logic)
485
+ elif not isinstance(raw_props_value, type(match_clause)):
459
486
  return None
460
487
  elif isinstance(raw_props_value, str):
461
488
  return re.match(match_clause, raw_props_value)
@@ -1,12 +1,13 @@
1
- import os
2
1
  import pathlib
3
2
  import tempfile
4
3
 
5
4
  import requests
6
5
 
7
- DOCKER_COMPOSE_BASE = os.getenv(
8
- "DOCKER_COMPOSE_BASE",
9
- "https://raw.githubusercontent.com/datahub-project/datahub/master",
6
+ from datahub.configuration.env_vars import get_docker_compose_base
7
+
8
+ DOCKER_COMPOSE_BASE = (
9
+ get_docker_compose_base()
10
+ or "https://raw.githubusercontent.com/datahub-project/datahub/master"
10
11
  )
11
12
  BOOTSTRAP_MCES_FILE = "metadata-ingestion/examples/mce_files/bootstrap_mce.json"
12
13
  BOOTSTRAP_MCES_URL = f"{DOCKER_COMPOSE_BASE}/{BOOTSTRAP_MCES_FILE}"