acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -89,7 +89,7 @@ class RedshiftCommonQuery:
89
89
  ) -> str:
90
90
  # NOTE: it looks like description is available only in pg_description
91
91
  # So this remains preferrred way
92
- tables_query = """
92
+ tables_query = f"""
93
93
  SELECT CASE c.relkind
94
94
  WHEN 'r' THEN 'TABLE'
95
95
  WHEN 'v' THEN 'VIEW'
@@ -120,6 +120,7 @@ class RedshiftCommonQuery:
120
120
  LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
121
121
  LEFT JOIN pg_class_info as ci on c.oid = ci.reloid
122
122
  LEFT JOIN pg_catalog.pg_description pgd ON pgd.objsubid = 0 AND pgd.objoid = c.oid
123
+ JOIN svv_redshift_schemas rs ON rs.schema_name = n.nspname AND rs.database_name = '{database}'
123
124
  WHERE c.relkind IN ('r','v','m','S','f')
124
125
  AND n.nspname !~ '^pg_'
125
126
  AND n.nspname != 'information_schema'
@@ -128,23 +129,24 @@ class RedshiftCommonQuery:
128
129
  external_tables_query = f"""
129
130
  SELECT 'EXTERNAL_TABLE' as tabletype,
130
131
  NULL AS "schema_oid",
131
- schemaname AS "schema",
132
+ t.schemaname AS "schema",
132
133
  NULL AS "rel_oid",
133
- tablename AS "relname",
134
+ t.tablename AS "relname",
134
135
  NULL as "creation_time",
135
136
  NULL AS "diststyle",
136
137
  NULL AS "owner_id",
137
138
  NULL AS "owner_name",
138
139
  NULL AS "view_definition",
139
140
  NULL AS "privileges",
140
- "location",
141
- parameters,
142
- input_format,
143
- output_format,
144
- serde_parameters,
141
+ t."location",
142
+ t.parameters,
143
+ t.input_format,
144
+ t.output_format,
145
+ t.serde_parameters,
145
146
  NULL as table_description
146
- FROM pg_catalog.svv_external_tables
147
- WHERE redshift_database_name='{database}'
147
+ FROM pg_catalog.svv_external_tables t
148
+ JOIN SVV_EXTERNAL_SCHEMAS s ON t.schemaname = s.schemaname
149
+ WHERE t.redshift_database_name='{database}'
148
150
  ORDER BY "schema",
149
151
  "relname"
150
152
  """
@@ -232,11 +234,12 @@ class RedshiftCommonQuery:
232
234
  ON att.attrelid = c.oid
233
235
  LEFT JOIN pg_catalog.pg_attrdef ad
234
236
  ON (att.attrelid, att.attnum) = (ad.adrelid, ad.adnum)
237
+ JOIN svv_redshift_schemas rs ON rs.schema_name = n.nspname AND rs.database_name = '{database_name}'
235
238
  WHERE n.nspname !~ '^pg_'
236
239
  AND n.nspname != 'information_schema'
237
240
  AND att.attnum > 0
238
241
  AND NOT att.attisdropped
239
- and schema = '{schema_name}'
242
+ and n.nspname = '{schema_name}'
240
243
  UNION
241
244
  SELECT
242
245
  view_schema as "schema",
@@ -263,26 +266,27 @@ class RedshiftCommonQuery:
263
266
  WHERE 1 and schema = '{schema_name}'
264
267
  UNION
265
268
  SELECT
266
- schemaname as "schema",
267
- tablename as "table_name",
268
- columnname as "name",
269
+ c.schemaname as "schema",
270
+ c.tablename as "table_name",
271
+ c.columnname as "name",
269
272
  null as "encode",
270
273
  -- Spectrum represents data types differently.
271
274
  -- Standardize, so we can infer types.
272
- external_type AS "type",
275
+ c.external_type AS "type",
273
276
  null as "distkey",
274
277
  0 as "sortkey",
275
278
  null as "notnull",
276
279
  null as "comment",
277
280
  null as "adsrc",
278
281
  null as "attnum",
279
- external_type AS "format_type",
282
+ c.external_type AS "format_type",
280
283
  null as "default",
281
284
  null as "schema_oid",
282
285
  null as "table_oid"
283
- FROM SVV_EXTERNAL_COLUMNS
284
- WHERE 1 and schema = '{schema_name}'
285
- AND redshift_database_name = '{database_name}'
286
+ FROM SVV_EXTERNAL_COLUMNS c
287
+ JOIN SVV_EXTERNAL_SCHEMAS s ON c.schemaname = s.schemaname
288
+ WHERE c.schemaname = '{schema_name}'
289
+ AND c.redshift_database_name = '{database_name}'
286
290
  ORDER BY "schema", "table_name", "attnum"
287
291
  """
288
292
 
@@ -1,5 +1,4 @@
1
1
  import functools
2
- import itertools
3
2
  import logging
4
3
  from collections import defaultdict
5
4
  from typing import Dict, Iterable, List, Optional, Type, Union
@@ -10,6 +9,7 @@ import humanfriendly
10
9
  import pydantic
11
10
  import redshift_connector
12
11
 
12
+ from datahub.configuration.common import AllowDenyPattern
13
13
  from datahub.configuration.pattern_utils import is_schema_allowed
14
14
  from datahub.emitter.mce_builder import (
15
15
  make_data_platform_urn,
@@ -46,12 +46,12 @@ from datahub.ingestion.source.common.data_reader import DataReader
46
46
  from datahub.ingestion.source.common.subtypes import (
47
47
  DatasetContainerSubTypes,
48
48
  DatasetSubTypes,
49
+ SourceCapabilityModifier,
49
50
  )
50
51
  from datahub.ingestion.source.redshift.config import RedshiftConfig
51
52
  from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
52
53
  from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
53
- from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor
54
- from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2
54
+ from datahub.ingestion.source.redshift.lineage import RedshiftSqlLineage
55
55
  from datahub.ingestion.source.redshift.profile import RedshiftProfiler
56
56
  from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
57
57
  from datahub.ingestion.source.redshift.redshift_schema import (
@@ -70,7 +70,6 @@ from datahub.ingestion.source.sql.sql_utils import (
70
70
  add_table_to_schema_container,
71
71
  gen_database_container,
72
72
  gen_database_key,
73
- gen_lineage,
74
73
  gen_schema_container,
75
74
  gen_schema_key,
76
75
  get_dataplatform_instance_aspect,
@@ -90,8 +89,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
90
89
  from datahub.ingestion.source_report.ingestion_stage import (
91
90
  LINEAGE_EXTRACTION,
92
91
  METADATA_EXTRACTION,
93
- PROFILING,
94
92
  USAGE_EXTRACTION_INGESTION,
93
+ IngestionHighStage,
95
94
  )
96
95
  from datahub.metadata.com.linkedin.pegasus2avro.common import SubTypes, TimeStamp
97
96
  from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
@@ -114,7 +113,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
114
113
  )
115
114
  from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass
116
115
  from datahub.utilities import memory_footprint
117
- from datahub.utilities.dedup_list import deduplicate_list
118
116
  from datahub.utilities.mapping import Constants
119
117
  from datahub.utilities.perf_timer import PerfTimer
120
118
  from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -125,7 +123,14 @@ logger: logging.Logger = logging.getLogger(__name__)
125
123
  @platform_name("Redshift")
126
124
  @config_class(RedshiftConfig)
127
125
  @support_status(SupportStatus.CERTIFIED)
128
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
126
+ @capability(
127
+ SourceCapability.CONTAINERS,
128
+ "Enabled by default",
129
+ subtype_modifier=[
130
+ SourceCapabilityModifier.DATABASE,
131
+ SourceCapabilityModifier.SCHEMA,
132
+ ],
133
+ )
129
134
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
130
135
  @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
131
136
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
@@ -138,14 +143,17 @@ logger: logging.Logger = logging.getLogger(__name__)
138
143
  @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
139
144
  @capability(
140
145
  SourceCapability.USAGE_STATS,
141
- "Enabled by default, can be disabled via configuration `include_usage_statistics`",
146
+ "Optionally enabled via `include_usage_statistics`",
147
+ )
148
+ @capability(
149
+ SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
142
150
  )
143
- @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
144
151
  @capability(
145
152
  SourceCapability.CLASSIFICATION,
146
153
  "Optionally enabled via `classification.enabled`",
147
154
  supported=True,
148
155
  )
156
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
149
157
  class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
150
158
  """
151
159
  This plugin extracts the following:
@@ -354,7 +362,23 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
354
362
  ).workunit_processor,
355
363
  ]
356
364
 
365
+ def _warn_deprecated_configs(self):
366
+ if (
367
+ self.config.match_fully_qualified_names is not None
368
+ and not self.config.match_fully_qualified_names
369
+ and self.config.schema_pattern is not None
370
+ and self.config.schema_pattern != AllowDenyPattern.allow_all()
371
+ ):
372
+ self.report.report_warning(
373
+ message="Please update `schema_pattern` to match against fully qualified schema name `<database_name>.<schema_name>` and set config `match_fully_qualified_names : True`."
374
+ "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. "
375
+ "The config option `match_fully_qualified_names` will be removed in future and the default behavior will be like `match_fully_qualified_names: True`.",
376
+ context="Config option deprecation warning",
377
+ title="Config option deprecation warning",
378
+ )
379
+
357
380
  def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]:
381
+ self._warn_deprecated_configs()
358
382
  connection = self._try_get_redshift_connection(self.config)
359
383
 
360
384
  if connection is None:
@@ -395,40 +419,25 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
395
419
  memory_footprint.total_size(self.db_views)
396
420
  )
397
421
 
398
- if self.config.use_lineage_v2:
399
- with RedshiftSqlLineageV2(
400
- config=self.config,
401
- report=self.report,
402
- context=self.ctx,
403
- database=database,
404
- redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
405
- ) as lineage_extractor:
406
- yield from lineage_extractor.aggregator.register_schemas_from_stream(
407
- self.process_schemas(connection, database)
408
- )
409
-
410
- with self.report.new_stage(LINEAGE_EXTRACTION):
411
- yield from self.extract_lineage_v2(
412
- connection=connection,
413
- database=database,
414
- lineage_extractor=lineage_extractor,
415
- )
416
-
417
- all_tables = self.get_all_tables()
418
- else:
419
- yield from self.process_schemas(connection, database)
422
+ with RedshiftSqlLineage(
423
+ config=self.config,
424
+ report=self.report,
425
+ context=self.ctx,
426
+ database=database,
427
+ redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
428
+ ) as lineage_extractor:
429
+ yield from lineage_extractor.aggregator.register_schemas_from_stream(
430
+ self.process_schemas(connection, database)
431
+ )
420
432
 
421
- all_tables = self.get_all_tables()
433
+ with self.report.new_stage(LINEAGE_EXTRACTION):
434
+ yield from self.extract_lineage_v2(
435
+ connection=connection,
436
+ database=database,
437
+ lineage_extractor=lineage_extractor,
438
+ )
422
439
 
423
- if (
424
- self.config.include_table_lineage
425
- or self.config.include_view_lineage
426
- or self.config.include_copy_lineage
427
- ):
428
- with self.report.new_stage(LINEAGE_EXTRACTION):
429
- yield from self.extract_lineage(
430
- connection=connection, all_tables=all_tables, database=database
431
- )
440
+ all_tables = self.get_all_tables()
432
441
 
433
442
  if self.config.include_usage_statistics:
434
443
  with self.report.new_stage(USAGE_EXTRACTION_INGESTION):
@@ -437,7 +446,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
437
446
  )
438
447
 
439
448
  if self.config.is_profiling_enabled():
440
- with self.report.new_stage(PROFILING):
449
+ with self.report.new_high_stage(IngestionHighStage.PROFILING):
441
450
  profiler = RedshiftProfiler(
442
451
  config=self.config,
443
452
  report=self.report,
@@ -940,45 +949,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
940
949
 
941
950
  self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2)
942
951
 
943
- def extract_lineage(
944
- self,
945
- connection: redshift_connector.Connection,
946
- database: str,
947
- all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
948
- ) -> Iterable[MetadataWorkUnit]:
949
- if not self._should_ingest_lineage():
950
- return
951
-
952
- lineage_extractor = RedshiftLineageExtractor(
953
- config=self.config,
954
- report=self.report,
955
- context=self.ctx,
956
- redundant_run_skip_handler=self.redundant_lineage_run_skip_handler,
957
- )
958
-
959
- with PerfTimer() as timer:
960
- lineage_extractor.populate_lineage(
961
- database=database, connection=connection, all_tables=all_tables
962
- )
963
-
964
- self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds(
965
- digits=2
966
- )
967
- yield from self.generate_lineage(
968
- database, lineage_extractor=lineage_extractor
969
- )
970
-
971
- if self.redundant_lineage_run_skip_handler:
972
- # Update the checkpoint state for this run.
973
- self.redundant_lineage_run_skip_handler.update_state(
974
- self.config.start_time, self.config.end_time
975
- )
976
-
977
952
  def extract_lineage_v2(
978
953
  self,
979
954
  connection: redshift_connector.Connection,
980
955
  database: str,
981
- lineage_extractor: RedshiftSqlLineageV2,
956
+ lineage_extractor: RedshiftSqlLineage,
982
957
  ) -> Iterable[MetadataWorkUnit]:
983
958
  if self.config.include_share_lineage:
984
959
  outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
@@ -1041,40 +1016,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
1041
1016
 
1042
1017
  return True
1043
1018
 
1044
- def generate_lineage(
1045
- self, database: str, lineage_extractor: RedshiftLineageExtractor
1046
- ) -> Iterable[MetadataWorkUnit]:
1047
- logger.info(f"Generate lineage for {database}")
1048
- for schema in deduplicate_list(
1049
- itertools.chain(self.db_tables[database], self.db_views[database])
1050
- ):
1051
- if (
1052
- database not in self.db_schemas
1053
- or schema not in self.db_schemas[database]
1054
- ):
1055
- logger.warning(
1056
- f"Either database {database} or {schema} exists in the lineage but was not discovered earlier. Something went wrong."
1057
- )
1058
- continue
1059
-
1060
- table_or_view: Union[RedshiftTable, RedshiftView]
1061
- for table_or_view in (
1062
- []
1063
- + self.db_tables[database].get(schema, [])
1064
- + self.db_views[database].get(schema, [])
1065
- ):
1066
- datahub_dataset_name = f"{database}.{schema}.{table_or_view.name}"
1067
- dataset_urn = self.gen_dataset_urn(datahub_dataset_name)
1068
-
1069
- lineage_info = lineage_extractor.get_lineage(
1070
- table_or_view,
1071
- dataset_urn,
1072
- self.db_schemas[database][schema],
1073
- )
1074
- if lineage_info:
1075
- # incremental lineage generation is taken care by auto_incremental_lineage
1076
- yield from gen_lineage(dataset_urn, lineage_info)
1077
-
1078
1019
  def add_config_to_report(self):
1079
1020
  self.report.stateful_lineage_ingestion_enabled = (
1080
1021
  self.config.enable_stateful_lineage_ingestion
@@ -15,6 +15,7 @@ from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable
15
15
  from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
16
16
  from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult
17
17
  from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
18
+ from datahub.utilities.perf_timer import PerfTimer
18
19
 
19
20
  logger: logging.Logger = logging.getLogger(__name__)
20
21
 
@@ -243,9 +244,13 @@ class RedshiftDataDictionary:
243
244
  conn: redshift_connector.Connection, query: str
244
245
  ) -> redshift_connector.Cursor:
245
246
  cursor: redshift_connector.Cursor = conn.cursor()
246
-
247
- logger.debug(f"Query : {query}")
248
- cursor.execute(query)
247
+ with PerfTimer() as timer:
248
+ query_hash_id = hash(query)
249
+ logger.info(f"Executing query [{query_hash_id}]\n{query}")
250
+ cursor.execute(query)
251
+ logger.info(
252
+ f"Time taken query [{query_hash_id}: {timer.elapsed_seconds():.3f} seconds"
253
+ )
249
254
  return cursor
250
255
 
251
256
  @staticmethod
@@ -545,8 +550,7 @@ class RedshiftDataDictionary:
545
550
  conn: redshift_connector.Connection,
546
551
  query: str,
547
552
  ) -> Iterable[LineageRow]:
548
- cursor = conn.cursor()
549
- cursor.execute(query)
553
+ cursor = RedshiftDataDictionary.get_query_result(conn=conn, query=query)
550
554
  field_names = [i[0] for i in cursor.description]
551
555
 
552
556
  rows = cursor.fetchmany()
@@ -603,9 +607,7 @@ class RedshiftDataDictionary:
603
607
  conn: redshift_connector.Connection,
604
608
  query: str,
605
609
  ) -> Iterable[TempTableRow]:
606
- cursor = conn.cursor()
607
-
608
- cursor.execute(query)
610
+ cursor = RedshiftDataDictionary.get_query_result(conn=conn, query=query)
609
611
 
610
612
  field_names = [i[0] for i in cursor.description]
611
613
 
@@ -662,8 +664,9 @@ class RedshiftDataDictionary:
662
664
  def get_outbound_datashares(
663
665
  conn: redshift_connector.Connection,
664
666
  ) -> Iterable[OutboundDatashare]:
665
- cursor = conn.cursor()
666
- cursor.execute(RedshiftCommonQuery.list_outbound_datashares())
667
+ cursor = RedshiftDataDictionary.get_query_result(
668
+ conn=conn, query=RedshiftCommonQuery.list_outbound_datashares()
669
+ )
667
670
  for item in cursor.fetchall():
668
671
  yield OutboundDatashare(
669
672
  share_name=item[1],
@@ -678,8 +681,10 @@ class RedshiftDataDictionary:
678
681
  conn: redshift_connector.Connection,
679
682
  database: str,
680
683
  ) -> Optional[InboundDatashare]:
681
- cursor = conn.cursor()
682
- cursor.execute(RedshiftCommonQuery.get_inbound_datashare(database))
684
+ cursor = RedshiftDataDictionary.get_query_result(
685
+ conn=conn,
686
+ query=RedshiftCommonQuery.get_inbound_datashare(database),
687
+ )
683
688
  item = cursor.fetchone()
684
689
  if item:
685
690
  return InboundDatashare(
@@ -4,7 +4,6 @@ from typing import Dict, Optional
4
4
 
5
5
  from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
6
6
  from datahub.ingestion.source.sql.sql_report import SQLSourceReport
7
- from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
8
7
  from datahub.ingestion.source_report.time_window import BaseTimeWindowReport
9
8
  from datahub.sql_parsing.sql_parsing_aggregator import SqlAggregatorReport
10
9
  from datahub.utilities.lossy_collections import LossyDict
@@ -15,7 +14,6 @@ from datahub.utilities.stats_collections import TopKDict
15
14
  @dataclass
16
15
  class RedshiftReport(
17
16
  SQLSourceReport,
18
- IngestionStageReport,
19
17
  BaseTimeWindowReport,
20
18
  ClassificationReportMixin,
21
19
  ):
@@ -25,6 +25,7 @@ from datahub.ingestion.source.redshift.query import (
25
25
  RedshiftServerlessQuery,
26
26
  )
27
27
  from datahub.ingestion.source.redshift.redshift_schema import (
28
+ RedshiftDataDictionary,
28
29
  RedshiftTable,
29
30
  RedshiftView,
30
31
  )
@@ -182,9 +183,10 @@ class RedshiftUsageExtractor:
182
183
  self.report.num_operational_stats_filtered = 0
183
184
 
184
185
  if self.config.include_operational_stats:
185
- with self.report.new_stage(
186
- USAGE_EXTRACTION_OPERATIONAL_STATS
187
- ), PerfTimer() as timer:
186
+ with (
187
+ self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS),
188
+ PerfTimer() as timer,
189
+ ):
188
190
  # Generate operation aspect workunits
189
191
  yield from self._gen_operation_aspect_workunits(
190
192
  self.connection, all_tables
@@ -262,8 +264,7 @@ class RedshiftUsageExtractor:
262
264
  connection: redshift_connector.Connection,
263
265
  all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
264
266
  ) -> Iterable[RedshiftAccessEvent]:
265
- cursor = connection.cursor()
266
- cursor.execute(query)
267
+ cursor = RedshiftDataDictionary.get_query_result(conn=connection, query=query)
267
268
  results = cursor.fetchmany()
268
269
  field_names = [i[0] for i in cursor.description]
269
270
  while results:
@@ -1,19 +1,21 @@
1
1
  import dataclasses
2
2
  from dataclasses import field as dataclass_field
3
- from typing import List
4
3
 
5
4
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
6
5
  StaleEntityRemovalSourceReport,
7
6
  )
7
+ from datahub.utilities.lossy_collections import LossyList
8
8
 
9
9
 
10
10
  @dataclasses.dataclass
11
11
  class DataLakeSourceReport(StaleEntityRemovalSourceReport):
12
12
  files_scanned = 0
13
- filtered: List[str] = dataclass_field(default_factory=list)
13
+ filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
14
+ number_of_files_filtered: int = 0
14
15
 
15
16
  def report_file_scanned(self) -> None:
16
17
  self.files_scanned += 1
17
18
 
18
19
  def report_file_dropped(self, file: str) -> None:
19
20
  self.filtered.append(file)
21
+ self.number_of_files_filtered += 1