acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -4,13 +4,15 @@ from enum import Enum
4
4
  from typing import Dict, List, Literal, Optional, Union
5
5
 
6
6
  import pydantic
7
- from pydantic import validator
8
- from pydantic.class_validators import root_validator
7
+ from pydantic import root_validator, validator
9
8
 
10
9
  import datahub.emitter.mce_builder as builder
11
- from datahub.configuration.common import AllowDenyPattern, ConfigModel
10
+ from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
12
11
  from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
13
12
  from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
13
+ from datahub.ingestion.api.incremental_lineage_helper import (
14
+ IncrementalLineageConfigMixin,
15
+ )
14
16
  from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
15
17
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
16
18
  StaleEntityRemovalSourceReport,
@@ -19,6 +21,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
19
21
  from datahub.ingestion.source.state.stateful_ingestion_base import (
20
22
  StatefulIngestionConfigBase,
21
23
  )
24
+ from datahub.utilities.global_warning_util import add_global_warning
22
25
  from datahub.utilities.lossy_collections import LossyList
23
26
  from datahub.utilities.perf_timer import PerfTimer
24
27
 
@@ -183,6 +186,16 @@ class SupportedDataPlatform(Enum):
183
186
  datahub_data_platform_name="databricks",
184
187
  )
185
188
 
189
+ MYSQL = DataPlatformPair(
190
+ powerbi_data_platform_name="MySQL",
191
+ datahub_data_platform_name="mysql",
192
+ )
193
+
194
+ ODBC = DataPlatformPair(
195
+ powerbi_data_platform_name="Odbc",
196
+ datahub_data_platform_name="odbc",
197
+ )
198
+
186
199
 
187
200
  @dataclass
188
201
  class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -275,44 +288,47 @@ class PowerBiProfilingConfig(ConfigModel):
275
288
 
276
289
 
277
290
  class PowerBiDashboardSourceConfig(
278
- StatefulIngestionConfigBase, DatasetSourceConfigMixin
291
+ StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
279
292
  ):
280
- platform_name: str = pydantic.Field(
281
- default=Constant.PLATFORM_NAME, hidden_from_docs=True
282
- )
293
+ platform_name: HiddenFromDocs[str] = pydantic.Field(default=Constant.PLATFORM_NAME)
283
294
 
284
- platform_urn: str = pydantic.Field(
295
+ platform_urn: HiddenFromDocs[str] = pydantic.Field(
285
296
  default=builder.make_data_platform_urn(platform=Constant.PLATFORM_NAME),
286
- hidden_from_docs=True,
287
297
  )
288
298
 
289
299
  # Organization Identifier
290
300
  tenant_id: str = pydantic.Field(description="PowerBI tenant identifier")
291
301
  # PowerBi workspace identifier
292
- workspace_id: Optional[str] = pydantic.Field(
302
+ workspace_id: HiddenFromDocs[Optional[str]] = pydantic.Field(
293
303
  default=None,
294
304
  description="[deprecated] Use workspace_id_pattern instead",
295
- hidden_from_docs=True,
296
305
  )
297
306
  # PowerBi workspace identifier
298
307
  workspace_id_pattern: AllowDenyPattern = pydantic.Field(
299
308
  default=AllowDenyPattern.allow_all(),
300
- description="Regex patterns to filter PowerBI workspaces in ingestion."
309
+ description="Regex patterns to filter PowerBI workspaces in ingestion by ID."
310
+ " By default all IDs are allowed unless they are filtered by name using 'workspace_name_pattern'."
311
+ " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
312
+ )
313
+ # PowerBi workspace name
314
+ workspace_name_pattern: AllowDenyPattern = pydantic.Field(
315
+ default=AllowDenyPattern.allow_all(),
316
+ description="Regex patterns to filter PowerBI workspaces in ingestion by name."
317
+ " By default all names are allowed unless they are filtered by ID using 'workspace_id_pattern'."
301
318
  " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
302
319
  )
303
320
 
304
321
  # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
305
322
  # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
306
323
  # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
307
- dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = (
308
- pydantic.Field(
309
- default_factory=default_for_dataset_type_mapping,
310
- description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
311
- "DataHub supported datasources."
312
- "You can configured platform instance for dataset lineage. "
313
- "See Quickstart Recipe for mapping",
314
- hidden_from_docs=True,
315
- )
324
+ dataset_type_mapping: HiddenFromDocs[
325
+ Union[Dict[str, str], Dict[str, PlatformDetail]]
326
+ ] = pydantic.Field(
327
+ default_factory=default_for_dataset_type_mapping,
328
+ description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
329
+ "DataHub supported datasources."
330
+ "You can configured platform instance for dataset lineage. "
331
+ "See Quickstart Recipe for mapping",
316
332
  )
317
333
  # PowerBI datasource's server to platform instance mapping
318
334
  server_to_platform_instance: Dict[
@@ -324,6 +340,26 @@ class PowerBiDashboardSourceConfig(
324
340
  "For Google BigQuery the datasource's server is google bigquery project name. "
325
341
  "For Databricks Unity Catalog the datasource's server is workspace FQDN.",
326
342
  )
343
+ # ODBC DSN to platform mapping
344
+ dsn_to_platform_name: Dict[str, str] = pydantic.Field(
345
+ default={},
346
+ description="A mapping of ODBC DSN to DataHub data platform name. "
347
+ "For example with an ODBC connection string 'DSN=database' where the database type "
348
+ "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
349
+ )
350
+ # ODBC DSN to database (or database.schema) mapping
351
+ dsn_to_database_schema: Dict[str, str] = pydantic.Field(
352
+ default={},
353
+ description="A mapping of ODBC DSN to database names with optional schema names "
354
+ "(some database platforms such a MySQL use the table name pattern 'database.table', "
355
+ "while others use the pattern 'database.schema.table'). "
356
+ "This mapping is used in conjunction with ODBC SQL query parsing. "
357
+ "If SQL queries used with ODBC do not reference fully qualified tables names, "
358
+ "then you should configure mappings for your DSNs. "
359
+ "For example with an ODBC connection string 'DSN=database' where the database "
360
+ "is 'prod' you would configure the mapping as 'database: prod'. "
361
+ "If the database is 'prod' and the schema is 'data' then mapping would be 'database: prod.data'.",
362
+ )
327
363
  # deprecated warning
328
364
  _dataset_type_mapping = pydantic_field_deprecated(
329
365
  "dataset_type_mapping",
@@ -373,8 +409,9 @@ class PowerBiDashboardSourceConfig(
373
409
  )
374
410
  # Enable/Disable extracting dataset schema
375
411
  extract_dataset_schema: bool = pydantic.Field(
376
- default=False,
377
- description="Whether to ingest PBI Dataset Table columns and measures",
412
+ default=True,
413
+ description="Whether to ingest PBI Dataset Table columns and measures."
414
+ " Note: this setting must be `true` for schema extraction and column lineage to be enabled.",
378
415
  )
379
416
  # Enable/Disable extracting lineage information of PowerBI Dataset
380
417
  extract_lineage: bool = pydantic.Field(
@@ -483,7 +520,7 @@ class PowerBiDashboardSourceConfig(
483
520
  include_workspace_name_in_dataset_urn: bool = pydantic.Field(
484
521
  default=False,
485
522
  description="It is recommended to set this to true, as it helps prevent the overwriting of datasets."
486
- "Read section #11560 at https://datahubproject.io/docs/how/updating-datahub/ before enabling this option."
523
+ "Read section #11560 at https://docs.datahub.com/docs/how/updating-datahub/ before enabling this option."
487
524
  "To maintain backward compatibility, this is set to False.",
488
525
  )
489
526
 
@@ -498,10 +535,9 @@ class PowerBiDashboardSourceConfig(
498
535
  "Increase this value if you encounter the 'M-Query Parsing Timeout' message in the connector report.",
499
536
  )
500
537
 
501
- metadata_api_timeout: int = pydantic.Field(
538
+ metadata_api_timeout: HiddenFromDocs[int] = pydantic.Field(
502
539
  default=30,
503
540
  description="timeout in seconds for Metadata Rest Api.",
504
- hidden_from_docs=True,
505
541
  )
506
542
 
507
543
  @root_validator(skip_on_failure=True)
@@ -510,6 +546,7 @@ class PowerBiDashboardSourceConfig(
510
546
  "native_query_parsing",
511
547
  "enable_advance_lineage_sql_construct",
512
548
  "extract_lineage",
549
+ "extract_dataset_schema",
513
550
  ]
514
551
 
515
552
  if (
@@ -536,7 +573,7 @@ class PowerBiDashboardSourceConfig(
536
573
  def map_data_platform(cls, value):
537
574
  # For backward compatibility convert input PostgreSql to PostgreSQL
538
575
  # PostgreSQL is name of the data-platform in M-Query
539
- if "PostgreSql" in value.keys():
576
+ if "PostgreSql" in value:
540
577
  platform_name = value["PostgreSql"]
541
578
  del value["PostgreSql"]
542
579
  value["PostgreSQL"] = platform_name
@@ -575,3 +612,31 @@ class PowerBiDashboardSourceConfig(
575
612
  )
576
613
 
577
614
  return values
615
+
616
@root_validator(skip_on_failure=True)
def validate_extract_dataset_schema(cls, values: Dict) -> Dict:
    """Register a global warning when ``extract_dataset_schema`` is explicitly ``False``.

    The values are never modified; the validator only reports the setting
    through ``add_global_warning`` and hands the same dict back.
    """
    # `is False` on purpose: a missing key (None) must not trigger the warning.
    schema_extraction_disabled = values.get("extract_dataset_schema") is False
    if schema_extraction_disabled:
        add_global_warning(
            "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
        )
    return values
623
+
624
@root_validator(skip_on_failure=True)
def validate_dsn_to_database_schema(cls, values: Dict) -> Dict:
    """Check the shape of the ``dsn_to_database_schema`` mapping.

    Every mapping value must be a string of the form ``database`` or
    ``database.schema`` (i.e. at most one dot).

    Raises:
        ValueError: if the setting is not a dict, a value is not a string, or
            a value splits on "." into anything other than one or two parts.
    """
    mapping = values.get("dsn_to_database_schema")
    if mapping is None:
        return values

    if not isinstance(mapping, dict):
        raise ValueError("dsn_to_database_schema must contain key-value pairs")

    # Only the values are validated; the DSN keys are accepted as given.
    for target in mapping.values():
        if not isinstance(target, str):
            raise ValueError("dsn_to_database_schema mapping values must be strings")
        if len(target.split(".")) not in (1, 2):
            raise ValueError(f"dsn_to_database_schema invalid mapping value: {target}")

    return values
@@ -74,3 +74,6 @@ class FunctionName(Enum):
74
74
  GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
75
75
  AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
76
76
  DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
77
+ MYSQL_DATA_ACCESS = "MySQL.Database"
78
+ ODBC_DATA_ACCESS = "Odbc.DataSource"
79
+ ODBC_QUERY = "Odbc.Query"
@@ -0,0 +1,185 @@
1
+ import re
2
+ from typing import Optional, Tuple, Union
3
+
4
# Regexes used by extract_server(); each captures the server/host value in
# group 1. extract_server() tries them in list order with re.IGNORECASE and
# returns the first hit, so order is the priority. Under re.IGNORECASE the
# mixed-case entries ("Server=", "Host=") match the same text as their
# upper-case counterparts listed earlier.
server_patterns = [
    r"Server=([^:]+)[:][0-9]+/.*",
    r"SERVER=\{([^}]*)\}",
    r"SERVER=([^;]*)",
    r"HOST=\{([^}]*)\}",
    r"HOST=([^;]*)",
    r"DATA SOURCE=\{([^}]*)\}",
    r"DATA SOURCE=([^;]*)",
    r"DSN=\{([^}]*)\}",
    r"DSN=([^;]*)",
    r"Server=([^;]*)",
    r"S3OutputLocation=([^;]*)",
    r"HTTPPath=([^;]*)",
    r"Host=([^;]*)",
]

# Regexes used by extract_dsn(), tried in order: double-quoted value,
# single-quoted value, then a bare value running up to the next ';'.
# Whitespace around '=' is tolerated. There is no braced ({...}) form here.
dsn_patterns = [
    r"DSN\s*=\s*\"([^\"]+)\"",
    r"DSN\s*=\s*\'([^\']+)\'",
    r"DSN\s*=\s*([^;]+)",
]

# Normalized platform key -> regex searched against a lower-cased driver or
# platform name. Callers iterate this dict in insertion order and stop at the
# first match, so earlier entries win when a name matches several patterns.
platform_patterns = {
    "mysql": r"mysql",
    "postgres": r"post(gre(s|sql)?|gres)",
    "mssql": r"(sql\s*server|mssql|sqlncli)",
    "oracle": r"oracle",
    "db2": r"db2",
    "sqlite": r"sqlite",
    "access": r"(access|\.mdb|\.accdb)",
    "excel": r"(excel|\.xls)",
    "firebird": r"firebird",
    "informix": r"informix",
    "sybase": r"sybase",
    "teradata": r"teradata",
    "hadoop": r"(hadoop|hive)",
    "snowflake": r"snowflake",
    "redshift": r"redshift",
    "bigquery": r"bigquery",
    "athena": r"(athena|aws\s*athena)",
    "databricks": r"(databricks|spark)",
}

# Normalized platform key (same keys as platform_patterns) -> the platform
# name returned as the second tuple element by extract_platform() and
# normalize_platform_name().
powerbi_platform_names = {
    "mysql": "MySQL",
    "postgres": "PostgreSQL",
    "mssql": "SQL Server",
    "oracle": "Oracle",
    "db2": "IBM DB2",
    "sqlite": "SQLite",
    "access": "Microsoft Access",
    "excel": "Microsoft Excel",
    "firebird": "Firebird",
    "informix": "IBM Informix",
    "sybase": "SAP Sybase",
    "teradata": "Teradata",
    "hadoop": "Hadoop",
    "snowflake": "Snowflake",
    "redshift": "Amazon Redshift",
    "bigquery": "Google BigQuery",
    "athena": "Amazon Athena",
    "databricks": "Databricks",
}
67
+
68
+
69
def extract_driver(connection_string: str) -> Union[str, None]:
    """
    Parse an ODBC connection string and extract the driver name.

    The braced form (``DRIVER={name}``) is tried first so that driver names
    containing spaces or semicolons come back intact and without braces; the
    bare form (``DRIVER=name``, up to the next ``;``) is the fallback.
    Matching is case-insensitive and tolerates whitespace around ``=``.

    Args:
        connection_string (str): The ODBC connection string

    Returns:
        str or None: The driver name with surrounding whitespace stripped,
        or None if no DRIVER keyword is present
    """
    # Order matters: the braced pattern must win, otherwise the bare pattern
    # would capture the value including its braces.
    for pattern in (r"DRIVER\s*=\s*\{([^}]*)\}", r"DRIVER\s*=\s*([^;]*)"):
        driver_match = re.search(pattern, connection_string, re.IGNORECASE)
        if driver_match:
            return driver_match.group(1).strip()

    return None
93
+
94
+
95
def extract_dsn(connection_string: str) -> Union[str, None]:
    """
    Extract the DSN value from an ODBC connection string.

    A brace-enclosed value (``DSN={name}``) is unwrapped first, so the braces
    are not returned as part of the name and a ``;`` inside the braces does
    not cut the value short. Otherwise the module-level ``dsn_patterns`` are
    tried in order. Matching is case-insensitive.

    Args:
        connection_string (str): The ODBC connection string

    Returns:
        str or None: The DSN value with surrounding whitespace stripped,
        or None if not found
    """
    # dsn_patterns has no braced form; without this check "DSN={name}" would
    # fall through to the bare pattern and come back as "{name}".
    braced_match = re.search(
        r"DSN\s*=\s*\{([^}]*)\}", connection_string, re.IGNORECASE
    )
    if braced_match:
        return braced_match.group(1).strip()

    for pattern in dsn_patterns:
        match = re.search(pattern, connection_string, re.IGNORECASE)
        if match:
            return match.group(1).strip()

    return None
111
+
112
+
113
def extract_server(connection_string: str) -> Union[str, None]:
    """
    Pull the server / host identifier out of an ODBC connection string.

    Lookup order:
      1. the first entry of ``server_patterns`` that matches;
      2. ``AwsRegion=<region>``, returned as ``aws-athena-<region>``;
      3. the host portion of a ``jdbc:spark://`` URL.

    All matching is case-insensitive.

    Args:
        connection_string (str): The ODBC connection string

    Returns:
        str or None: The extracted value with surrounding whitespace stripped,
        or None when none of the lookups match
    """
    flags = re.IGNORECASE

    # Lazy generator: patterns after the first hit are never evaluated.
    attempts = (re.search(regex, connection_string, flags) for regex in server_patterns)
    first_hit = next((hit for hit in attempts if hit), None)
    if first_hit is not None:
        return first_hit.group(1).strip()

    # Athena fallback: synthesize a server name from the region.
    athena_region = re.search(r"AwsRegion=([^;]*)", connection_string, flags)
    if athena_region is not None:
        return f"aws-athena-{athena_region.group(1).strip()}"

    # Databricks fallback: hostname embedded in a JDBC URL.
    spark_host = re.search(r"jdbc:spark://([^:;/]+)", connection_string, flags)
    if spark_host is None:
        return None
    return spark_host.group(1).strip()
140
+
141
+
142
def extract_platform(connection_string: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Identify the database platform from the driver named in an ODBC connection string.

    Args:
        connection_string (str): The ODBC connection string

    Returns:
        tuple: ``(normalized platform key, Power BI platform name)``, or
        ``(None, None)`` when the string has no driver (or an empty one) or
        the driver name matches no known platform pattern.
    """
    driver = extract_driver(connection_string)
    if driver:
        # Same lower-case + first-matching-pattern lookup as for plain platform names.
        return normalize_platform_name(driver)
    return None, None
165
+
166
+
167
def normalize_platform_name(platform: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Map a free-form platform name onto a known platform key and its Power BI name.

    The input is lower-cased and searched against each regex in
    ``platform_patterns`` in dict order; the first match wins.

    Args:
        platform (str): The platform name to normalize

    Returns:
        tuple: ``(normalized platform key, Power BI platform name)``, or
        ``(None, None)`` when no pattern matches.
    """
    needle = platform.lower()

    matched_key = next(
        (key for key, regex in platform_patterns.items() if re.search(regex, needle)),
        None,
    )
    if matched_key is None:
        return None, None
    return matched_key, powerbi_platform_names.get(matched_key)
@@ -1,13 +1,13 @@
1
1
  import functools
2
2
  import importlib.resources as pkg_resource
3
3
  import logging
4
- import os
5
4
  from typing import Dict, List, Optional
6
5
 
7
6
  import lark
8
7
  from lark import Lark, Tree
9
8
 
10
9
  import datahub.ingestion.source.powerbi.m_query.data_classes
10
+ from datahub.configuration.env_vars import get_powerbi_m_query_parse_timeout
11
11
  from datahub.ingestion.api.common import PipelineContext
12
12
  from datahub.ingestion.source.powerbi.config import (
13
13
  PowerBiDashboardSourceConfig,
@@ -25,7 +25,7 @@ from datahub.utilities.threading_timeout import TimeoutException, threading_time
25
25
 
26
26
  logger = logging.getLogger(__name__)
27
27
 
28
- _M_QUERY_PARSE_TIMEOUT = int(os.getenv("DATAHUB_POWERBI_M_QUERY_PARSE_TIMEOUT", 60))
28
+ _M_QUERY_PARSE_TIMEOUT = get_powerbi_m_query_parse_timeout()
29
29
 
30
30
 
31
31
  @functools.lru_cache(maxsize=1)