acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,21 +1,20 @@
  import logging
- import traceback
  from collections import defaultdict
  from dataclasses import dataclass, field
  from datetime import datetime
  from enum import Enum
- from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+ from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
  from urllib.parse import urlparse

- import humanfriendly
  import redshift_connector
  import sqlglot

- import datahub.emitter.mce_builder as builder
  import datahub.sql_parsing.sqlglot_lineage as sqlglot_l
  from datahub.emitter import mce_builder
  from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+ from datahub.ingestion.api.closeable import Closeable
  from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws.s3_util import strip_s3_prefix
  from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig
  from datahub.ingestion.source.redshift.query import (
@@ -35,30 +34,20 @@ from datahub.ingestion.source.redshift.report import RedshiftReport
  from datahub.ingestion.source.state.redundant_run_skip_handler import (
      RedundantLineageRunSkipHandler,
  )
- from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
-     FineGrainedLineage,
-     FineGrainedLineageDownstreamType,
-     FineGrainedLineageUpstreamType,
-     UpstreamLineage,
- )
- from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-     OtherSchema,
-     SchemaField,
-     SchemaMetadata,
- )
  from datahub.metadata.schema_classes import (
      DatasetLineageTypeClass,
-     UpstreamClass,
-     UpstreamLineageClass,
  )
  from datahub.metadata.urns import DatasetUrn
- from datahub.sql_parsing.schema_resolver import SchemaResolver
- from datahub.sql_parsing.sql_parsing_aggregator import TableRename
+ from datahub.sql_parsing.sql_parsing_aggregator import (
+     KnownQueryLineageInfo,
+     ObservedQuery,
+     SqlParsingAggregator,
+     TableRename,
+ )
  from datahub.sql_parsing.sqlglot_utils import get_dialect, parse_statement
- from datahub.utilities import memory_footprint
- from datahub.utilities.dedup_list import deduplicate_list
+ from datahub.utilities.perf_timer import PerfTimer

- logger: logging.Logger = logging.getLogger(__name__)
+ logger = logging.getLogger(__name__)


  class LineageDatasetPlatform(Enum):
@@ -100,30 +89,6 @@ class LineageItem:
          else:
              self.dataset_lineage_type = DatasetLineageTypeClass.TRANSFORMED

-     def merge_lineage(
-         self,
-         upstreams: Set[LineageDataset],
-         cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
-     ) -> None:
-         self.upstreams = self.upstreams.union(upstreams)
-
-         # Merge CLL using the output column name as the merge key.
-         self.cll = self.cll or []
-         existing_cll: Dict[str, sqlglot_l.ColumnLineageInfo] = {
-             c.downstream.column: c for c in self.cll
-         }
-         for c in cll or []:
-             if c.downstream.column in existing_cll:
-                 # Merge using upstream + column name as the merge key.
-                 existing_cll[c.downstream.column].upstreams = deduplicate_list(
-                     [*existing_cll[c.downstream.column].upstreams, *c.upstreams]
-                 )
-             else:
-                 # New output column, just add it as is.
-                 self.cll.append(c)
-
-         self.cll = self.cll or None
-

  def parse_alter_table_rename(default_schema: str, query: str) -> Tuple[str, str, str]:
      """
@@ -142,117 +107,48 @@ def parse_alter_table_rename(default_schema: str, query: str) -> Tuple[str, str,
      return schema, prev_name, new_name


- def split_qualified_table_name(urn: str) -> Tuple[str, str, str]:
-     qualified_table_name = DatasetUrn.from_string(urn).name
-
-     # -3 because platform instance is optional and that can cause the split to have more than 3 elements
-     db, schema, table = qualified_table_name.split(".")[-3:]
+ class RedshiftSqlLineage(Closeable):
+     # does lineage and usage based on SQL parsing.

-     return db, schema, table
-
-
- class RedshiftLineageExtractor:
      def __init__(
          self,
          config: RedshiftConfig,
          report: RedshiftReport,
          context: PipelineContext,
+         database: str,
          redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None,
      ):
+         self.platform = "redshift"
          self.config = config
          self.report = report
          self.context = context
-         self._lineage_map: Dict[str, LineageItem] = defaultdict()
+         self.database = database
+         self.known_urns: Set[str] = set()  # will be set later
+         self.redundant_run_skip_handler = redundant_run_skip_handler
+
+         self.aggregator = SqlParsingAggregator(
+             platform=self.platform,
+             platform_instance=self.config.platform_instance,
+             env=self.config.env,
+             generate_lineage=True,
+             generate_queries=self.config.lineage_generate_queries,
+             generate_usage_statistics=False,
+             generate_operations=False,
+             usage_config=self.config,
+             graph=self.context.graph,
+             is_temp_table=self._is_temp_table,
+         )
+         self.report.sql_aggregator = self.aggregator.report

          self.queries: RedshiftCommonQuery = RedshiftProvisionedQuery()
          if self.config.is_serverless:
              self.queries = RedshiftServerlessQuery()

-         self.redundant_run_skip_handler = redundant_run_skip_handler
          self.start_time, self.end_time = (
              self.report.lineage_start_time,
              self.report.lineage_end_time,
          ) = self.get_time_window()

-         self.temp_tables: Dict[str, TempTableRow] = {}
-
-     def _init_temp_table_schema(
-         self, database: str, temp_tables: List[TempTableRow]
-     ) -> None:
-         if self.context.graph is None:  # to silent lint
-             return
-
-         schema_resolver: SchemaResolver = self.context.graph._make_schema_resolver(
-             platform=LineageDatasetPlatform.REDSHIFT.value,
-             platform_instance=self.config.platform_instance,
-             env=self.config.env,
-         )
-
-         dataset_vs_columns: Dict[str, List[SchemaField]] = {}
-         # prepare dataset_urn vs List of schema fields
-         for table in temp_tables:
-             logger.debug(
-                 f"Processing temp table: {table.create_command} with query text {table.query_text}"
-             )
-             result = sqlglot_l.create_lineage_sql_parsed_result(
-                 platform=LineageDatasetPlatform.REDSHIFT.value,
-                 platform_instance=self.config.platform_instance,
-                 env=self.config.env,
-                 default_db=database,
-                 default_schema=self.config.default_schema,
-                 query=table.query_text,
-                 graph=self.context.graph,
-             )
-
-             if (
-                 result is None
-                 or result.column_lineage is None
-                 or not result.query_type.is_create()
-                 or not result.out_tables
-             ):
-                 logger.debug(f"Unsupported temp table query found: {table.query_text}")
-                 continue
-
-             table.parsed_result = result
-             if result.column_lineage[0].downstream.table:
-                 table.urn = result.column_lineage[0].downstream.table
-
-             self.temp_tables[result.out_tables[0]] = table
-
-         for table in self.temp_tables.values():
-             if (
-                 table.parsed_result is None
-                 or table.urn is None
-                 or table.parsed_result.column_lineage is None
-             ):
-                 continue
-
-             # Initialise the temp table urn, we later need this to merge CLL
-             downstream_urn = table.urn
-             if downstream_urn not in dataset_vs_columns:
-                 dataset_vs_columns[downstream_urn] = []
-             dataset_vs_columns[downstream_urn].extend(
-                 sqlglot_l.infer_output_schema(table.parsed_result) or []
-             )
-
-         # Add datasets, and it's respective fields in schema_resolver, so that later schema_resolver would be able
-         # correctly generates the upstreams for temporary tables
-         for urn in dataset_vs_columns:
-             db, schema, table_name = split_qualified_table_name(urn)
-             schema_resolver.add_schema_metadata(
-                 urn=urn,
-                 schema_metadata=SchemaMetadata(
-                     schemaName=table_name,
-                     platform=builder.make_data_platform_urn(
-                         LineageDatasetPlatform.REDSHIFT.value
-                     ),
-                     version=0,
-                     hash="",
-                     platformSchema=OtherSchema(rawSchema=""),
-                     fields=dataset_vs_columns[urn],
-                 ),
-             )
-
      def get_time_window(self) -> Tuple[datetime, datetime]:
          if self.redundant_run_skip_handler:
              self.report.stateful_lineage_ingestion_enabled = True
@@ -262,9 +158,20 @@ class RedshiftLineageExtractor:
          else:
              return self.config.start_time, self.config.end_time

-     def warn(self, log: logging.Logger, key: str, reason: str) -> None:
-         # TODO: Remove this method.
-         self.report.warning(key, reason)
+     def report_status(self, step: str, status: bool) -> None:
+         if self.redundant_run_skip_handler:
+             self.redundant_run_skip_handler.report_current_run_status(step, status)
+
+     def _is_temp_table(self, name: str) -> bool:
+         return (
+             DatasetUrn.create_from_ids(
+                 self.platform,
+                 name,
+                 env=self.config.env,
+                 platform_instance=self.config.platform_instance,
+             ).urn()
+             not in self.known_urns
+         )

      def _get_s3_path(self, path: str) -> Optional[str]:
          if self.config.s3_lineage_config:
@@ -289,6 +196,15 @@

          return path

+     def _build_s3_path_from_row(self, filename: str) -> Optional[str]:
+         path = filename.strip()
+         if urlparse(path).scheme != "s3":
+             raise ValueError(
+                 f"Only s3 source supported with copy/unload. The source was: {path}"
+             )
+         s3_path = self._get_s3_path(path)
+         return strip_s3_prefix(s3_path) if s3_path else None
+
      def _get_sources_from_query(
          self,
          db_name: str,
@@ -335,15 +251,6 @@
              ),
          )

-     def _build_s3_path_from_row(self, filename: str) -> Optional[str]:
-         path = filename.strip()
-         if urlparse(path).scheme != "s3":
-             raise ValueError(
-                 f"Only s3 source supported with copy/unload. The source was: {path}"
-             )
-         s3_path = self._get_s3_path(path)
-         return strip_s3_prefix(s3_path) if s3_path else None
-
      def _get_sources(
          self,
          lineage_type: LineageCollectorType,
@@ -418,112 +325,6 @@

          return sources, cll

-     def _populate_lineage_map(
-         self,
-         query: str,
-         database: str,
-         lineage_type: LineageCollectorType,
-         connection: redshift_connector.Connection,
-         all_tables_set: Dict[str, Dict[str, Set[str]]],
-     ) -> None:
-         """
-         This method generate table level lineage based with the given query.
-         The query should return the following columns: target_schema, target_table, source_table, source_schema
-         source_table and source_schema can be omitted if the sql_field is set because then it assumes the source_table
-         and source_schema will be extracted from the sql_field by sql parsing.
-
-         :param query: The query to run to extract lineage.
-         :type query: str
-         :param lineage_type: The way the lineage should be processed
-         :type lineage_type: LineageType
-         return: The method does not return with anything as it directly modify the self._lineage_map property.
-         :rtype: None
-         """
-
-         logger.info(f"Extracting {lineage_type.name} lineage for db {database}")
-         try:
-             logger.debug(f"Processing lineage query: {query}")
-             cll: Optional[List[sqlglot_l.ColumnLineageInfo]] = None
-             raw_db_name = database
-             alias_db_name = self.config.database
-
-             for lineage_row in RedshiftDataDictionary.get_lineage_rows(
-                 conn=connection, query=query
-             ):
-                 target = self._get_target_lineage(
-                     alias_db_name,
-                     lineage_row,
-                     lineage_type,
-                     all_tables_set=all_tables_set,
-                 )
-                 if not target:
-                     continue
-
-                 logger.debug(
-                     f"Processing {lineage_type.name} lineage row: {lineage_row}"
-                 )
-
-                 sources, cll = self._get_sources(
-                     lineage_type,
-                     alias_db_name,
-                     source_schema=lineage_row.source_schema,
-                     source_table=lineage_row.source_table,
-                     ddl=lineage_row.ddl,
-                     filename=lineage_row.filename,
-                 )
-
-                 target.upstreams.update(
-                     self._get_upstream_lineages(
-                         sources=sources,
-                         target_table=target.dataset.urn,
-                         target_dataset_cll=cll,
-                         all_tables_set=all_tables_set,
-                         alias_db_name=alias_db_name,
-                         raw_db_name=raw_db_name,
-                         connection=connection,
-                     )
-                 )
-                 target.cll = cll
-
-                 # Merging upstreams if dataset already exists and has upstreams
-                 if target.dataset.urn in self._lineage_map:
-                     self._lineage_map[target.dataset.urn].merge_lineage(
-                         upstreams=target.upstreams, cll=target.cll
-                     )
-                 else:
-                     self._lineage_map[target.dataset.urn] = target
-
-                 logger.debug(
-                     f"Lineage[{target}]:{self._lineage_map[target.dataset.urn]}"
-                 )
-         except Exception as e:
-             self.warn(
-                 logger,
-                 f"extract-{lineage_type.name}",
-                 f"Error was {e}, {traceback.format_exc()}",
-             )
-             self.report_status(f"extract-{lineage_type.name}", False)
-
-     def _update_lineage_map_for_table_renames(
-         self, table_renames: Dict[str, TableRename]
-     ) -> None:
-         if not table_renames:
-             return
-
-         logger.info(f"Updating lineage map for {len(table_renames)} table renames")
-         for entry in table_renames.values():
-             # This table was renamed from some other name, copy in the lineage
-             # for the previous name as well.
-             prev_table_lineage = self._lineage_map.get(entry.original_urn)
-             if prev_table_lineage:
-                 logger.debug(
-                     f"including lineage for {entry.original_urn} in {entry.new_urn} due to table rename"
-                 )
-                 self._lineage_map[entry.new_urn].merge_lineage(
-                     upstreams=prev_table_lineage.upstreams,
-                     cll=prev_table_lineage.cll,
-                 )
-
      def _get_target_lineage(
          self,
          alias_db_name: str,
@@ -569,7 +370,7 @@
                      ),
                  )
              except ValueError as e:
-                 self.warn(logger, "non-s3-lineage", str(e))
+                 self.report.warning("non-s3-lineage", str(e))
                  return None
          else:
              target_platform = LineageDatasetPlatform.REDSHIFT
@@ -588,269 +389,6 @@
              cll=None,
          )

-     def _get_upstream_lineages(
-         self,
-         sources: List[LineageDataset],
-         target_table: str,
-         all_tables_set: Dict[str, Dict[str, Set[str]]],
-         alias_db_name: str,
-         raw_db_name: str,
-         connection: redshift_connector.Connection,
-         target_dataset_cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
-     ) -> List[LineageDataset]:
-         target_source = []
-         probable_temp_tables: List[str] = []
-
-         for source in sources:
-             if source.platform == LineageDatasetPlatform.REDSHIFT:
-                 db, schema, table = split_qualified_table_name(source.urn)
-                 if db == raw_db_name:
-                     db = alias_db_name
-                     path = f"{db}.{schema}.{table}"
-                     source = LineageDataset(
-                         platform=source.platform,
-                         urn=make_dataset_urn_with_platform_instance(
-                             platform=LineageDatasetPlatform.REDSHIFT.value,
-                             platform_instance=self.config.platform_instance,
-                             name=path,
-                             env=self.config.env,
-                         ),
-                     )
-
-                 # Filtering out tables which does not exist in Redshift
-                 # It was deleted in the meantime or query parser did not capture well the table name
-                 # Or it might be a temp table
-                 if (
-                     db not in all_tables_set
-                     or schema not in all_tables_set[db]
-                     or table not in all_tables_set[db][schema]
-                 ):
-                     logger.debug(
-                         f"{source.urn} missing table. Adding it to temp table list for target table {target_table}.",
-                     )
-                     probable_temp_tables.append(f"{schema}.{table}")
-                     self.report.num_lineage_tables_dropped += 1
-                     continue
-
-             target_source.append(source)
-
-         if probable_temp_tables and self.config.resolve_temp_table_in_lineage:
-             self.report.num_lineage_processed_temp_tables += len(probable_temp_tables)
-             # Generate lineage dataset from temporary tables
-             number_of_permanent_dataset_found: int = (
-                 self.update_table_and_column_lineage(
-                     db_name=raw_db_name,
-                     connection=connection,
-                     temp_table_names=probable_temp_tables,
-                     target_source_dataset=target_source,
-                     target_dataset_cll=target_dataset_cll,
-                 )
-             )
-
-             logger.debug(
-                 f"Number of permanent datasets found for {target_table} = {number_of_permanent_dataset_found} in "
-                 f"temp tables {probable_temp_tables}"
-             )
-
-         return target_source
-
-     def populate_lineage(
-         self,
-         database: str,
-         connection: redshift_connector.Connection,
-         all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-     ) -> None:
-         if self.config.resolve_temp_table_in_lineage:
-             self._init_temp_table_schema(
-                 database=database,
-                 temp_tables=list(self.get_temp_tables(connection=connection)),
-             )
-
-         populate_calls: List[Tuple[str, LineageCollectorType]] = []
-
-         all_tables_set: Dict[str, Dict[str, Set[str]]] = {
-             db: {schema: {t.name for t in tables} for schema, tables in schemas.items()}
-             for db, schemas in all_tables.items()
-         }
-
-         table_renames: Dict[str, TableRename] = {}
-         if self.config.include_table_rename_lineage:
-             table_renames, all_tables_set = self._process_table_renames(
-                 database=database,
-                 connection=connection,
-                 all_tables=all_tables_set,
-             )
-
-         if self.config.table_lineage_mode in {
-             LineageMode.STL_SCAN_BASED,
-             LineageMode.MIXED,
-         }:
-             # Populate table level lineage by getting upstream tables from stl_scan redshift table
-             query = self.queries.stl_scan_based_lineage_query(
-                 self.config.database,
-                 self.start_time,
-                 self.end_time,
-             )
-             populate_calls.append((query, LineageCollectorType.QUERY_SCAN))
-         if self.config.table_lineage_mode in {
-             LineageMode.SQL_BASED,
-             LineageMode.MIXED,
-         }:
-             # Populate table level lineage by parsing table creating sqls
-             query = self.queries.list_insert_create_queries_sql(
-                 db_name=database,
-                 start_time=self.start_time,
-                 end_time=self.end_time,
-             )
-             populate_calls.append((query, LineageCollectorType.QUERY_SQL_PARSER))
-
-         if self.config.include_views and self.config.include_view_lineage:
-             # Populate table level lineage for views
-             query = self.queries.view_lineage_query()
-             populate_calls.append((query, LineageCollectorType.VIEW))
-
-             # Populate table level lineage for late binding views
-             query = self.queries.list_late_view_ddls_query()
-             populate_calls.append((query, LineageCollectorType.VIEW_DDL_SQL_PARSING))
-
-         if self.config.include_copy_lineage:
-             query = self.queries.list_copy_commands_sql(
-                 db_name=database,
-                 start_time=self.start_time,
-                 end_time=self.end_time,
-             )
-             populate_calls.append((query, LineageCollectorType.COPY))
-
-         if self.config.include_unload_lineage:
-             query = self.queries.list_unload_commands_sql(
-                 db_name=database,
-                 start_time=self.start_time,
-                 end_time=self.end_time,
-             )
-
-             populate_calls.append((query, LineageCollectorType.UNLOAD))
-
-         for query, lineage_type in populate_calls:
-             self._populate_lineage_map(
-                 query=query,
-                 database=database,
-                 lineage_type=lineage_type,
-                 connection=connection,
-                 all_tables_set=all_tables_set,
-             )
-
-         # Handling for alter table statements.
-         self._update_lineage_map_for_table_renames(table_renames=table_renames)
-
-         self.report.lineage_mem_size[self.config.database] = humanfriendly.format_size(
-             memory_footprint.total_size(self._lineage_map)
-         )
-
-     def make_fine_grained_lineage_class(
-         self, lineage_item: LineageItem, dataset_urn: str
-     ) -> List[FineGrainedLineage]:
-         fine_grained_lineages: List[FineGrainedLineage] = []
-
-         if (
-             self.config.extract_column_level_lineage is False
-             or lineage_item.cll is None
-         ):
-             logger.debug("CLL extraction is disabled")
-             return fine_grained_lineages
-
-         logger.debug("Extracting column level lineage")
-
-         cll: List[sqlglot_l.ColumnLineageInfo] = lineage_item.cll
-
-         for cll_info in cll:
-             downstream = (
-                 [builder.make_schema_field_urn(dataset_urn, cll_info.downstream.column)]
-                 if cll_info.downstream is not None
-                 and cll_info.downstream.column is not None
-                 else []
-             )
-
-             upstreams = [
-                 builder.make_schema_field_urn(column_ref.table, column_ref.column)
-                 for column_ref in cll_info.upstreams
-             ]
-
-             fine_grained_lineages.append(
-                 FineGrainedLineage(
-                     downstreamType=FineGrainedLineageDownstreamType.FIELD,
-                     downstreams=downstream,
-                     upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
-                     upstreams=upstreams,
-                 )
-             )
-
-         logger.debug(f"Created fine_grained_lineage for {dataset_urn}")
-
-         return fine_grained_lineages
-
-     def get_lineage(
-         self,
-         table: Union[RedshiftTable, RedshiftView],
-         dataset_urn: str,
-         schema: RedshiftSchema,
-     ) -> Optional[UpstreamLineageClass]:
-         upstream_lineage: List[UpstreamClass] = []
-
-         cll_lineage: List[FineGrainedLineage] = []
-
-         if dataset_urn in self._lineage_map:
-             item = self._lineage_map[dataset_urn]
-             for upstream in item.upstreams:
-                 upstream_table = UpstreamClass(
-                     dataset=upstream.urn,
-                     type=item.dataset_lineage_type,
-                 )
-                 upstream_lineage.append(upstream_table)
-
-             cll_lineage = self.make_fine_grained_lineage_class(
-                 lineage_item=item,
-                 dataset_urn=dataset_urn,
-             )
-
-         tablename = table.name
-         if (
-             table.is_external_table()
-             and schema.is_external_schema()
-             and schema.external_platform
-         ):
-             # external_db_params = schema.option
-             upstream_platform = schema.external_platform.lower()
-             catalog_upstream = UpstreamClass(
-                 mce_builder.make_dataset_urn_with_platform_instance(
-                     upstream_platform,
-                     f"{schema.external_database}.{tablename}",
-                     platform_instance=(
-                         self.config.platform_instance_map.get(upstream_platform)
-                         if self.config.platform_instance_map
-                         else None
-                     ),
-                     env=self.config.env,
-                 ),
-                 DatasetLineageTypeClass.COPY,
-             )
-             upstream_lineage.append(catalog_upstream)
-
-         if upstream_lineage:
-             self.report.upstream_lineage[dataset_urn] = [
-                 u.dataset for u in upstream_lineage
-             ]
-         else:
-             return None
-
-         return UpstreamLineage(
-             upstreams=upstream_lineage,
-             fineGrainedLineages=cll_lineage or None,
-         )
-
-     def report_status(self, step: str, status: bool) -> None:
-         if self.redundant_run_skip_handler:
-             self.redundant_run_skip_handler.report_current_run_status(step, status)
-

      def _process_table_renames(
          self,
          database: str,
@@ -924,204 +462,365 @@ class RedshiftLineageExtractor:
          ):
              yield row

-     def find_temp_tables(
-         self, temp_table_rows: List[TempTableRow], temp_table_names: List[str]
-     ) -> List[TempTableRow]:
-         matched_temp_tables: List[TempTableRow] = []
+     def build(
+         self,
+         connection: redshift_connector.Connection,
+         all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
+         db_schemas: Dict[str, Dict[str, RedshiftSchema]],
+     ) -> None:
+         # Assume things not in `all_tables` as temp tables.
+         self.known_urns = {
+             DatasetUrn.create_from_ids(
+                 self.platform,
+                 f"{db}.{schema}.{table.name}",
+                 env=self.config.env,
+                 platform_instance=self.config.platform_instance,
+             ).urn()
+             for db, schemas in all_tables.items()
+             for schema, tables in schemas.items()
+             for table in tables
+         }
+
+         # Handle all the temp tables up front.
+         if self.config.resolve_temp_table_in_lineage:
+             for temp_row in self.get_temp_tables(connection=connection):
+                 self.aggregator.add_observed_query(
+                     ObservedQuery(
+                         query=temp_row.query_text,
+                         default_db=self.database,
+                         default_schema=self.config.default_schema,
+                         session_id=temp_row.session_id,
+                         timestamp=temp_row.start_time,
+                     ),
+                     # The "temp table" query actually returns all CREATE TABLE statements, even if they
+                     # aren't explicitly a temp table. As such, setting is_known_temp_table=True
+                     # would not be correct. We already have mechanisms to autodetect temp tables,
+                     # so we won't lose anything by not setting it.
+                     is_known_temp_table=False,
+                 )
+
+         populate_calls: List[Tuple[LineageCollectorType, str, Callable]] = []

-         for table_name in temp_table_names:
-             prefixes = self.queries.get_temp_table_clause(table_name)
-             prefixes.extend(
-                 self.queries.get_temp_table_clause(table_name.split(".")[-1])
+         if self.config.include_table_rename_lineage:
+             # Process all the ALTER TABLE RENAME statements
+             table_renames, _ = self._process_table_renames(
+                 database=self.database,
+                 connection=connection,
+                 all_tables=defaultdict(lambda: defaultdict(set)),
              )
+             for entry in table_renames.values():
+                 self.aggregator.add_table_rename(entry)

-         for row in temp_table_rows:
-             if any(
-                 row.create_command.lower().startswith(prefix) for prefix in prefixes
-             ):
-                 matched_temp_tables.append(row)
+         if self.config.table_lineage_mode in {
+             LineageMode.SQL_BASED,
+             LineageMode.MIXED,
+         }:
+             # Populate lineage by parsing table creating sqls
+             query = self.queries.list_insert_create_queries_sql(
+                 db_name=self.database,
+                 start_time=self.start_time,
+                 end_time=self.end_time,
+             )
+             populate_calls.append(
+                 (
+                     LineageCollectorType.QUERY_SQL_PARSER,
+                     query,
+                     self._process_sql_parser_lineage,
+                 )
+             )
+         if self.config.table_lineage_mode in {
+             LineageMode.STL_SCAN_BASED,
+             LineageMode.MIXED,
+         }:
+             # Populate lineage by getting upstream tables from stl_scan redshift table
+             query = self.queries.stl_scan_based_lineage_query(
+                 self.database,
+                 self.start_time,
+                 self.end_time,
+             )
+             populate_calls.append(
+                 (LineageCollectorType.QUERY_SCAN, query, self._process_stl_scan_lineage)
+             )

-         return matched_temp_tables
+         if self.config.include_views and self.config.include_view_lineage:
+             # Populate lineage for views
+             query = self.queries.view_lineage_query()
+             populate_calls.append(
+                 (LineageCollectorType.VIEW, query, self._process_view_lineage)
+             )

-     def resolve_column_refs(
-         self, column_refs: List[sqlglot_l.ColumnRef], depth: int = 0
-     ) -> List[sqlglot_l.ColumnRef]:
-         """
-         This method resolves the column reference to the original column reference.
-         For example, if the column reference is to a temporary table, it will be resolved to the original column
-         reference.
-         """
-         max_depth = 10
+             # Populate lineage for late binding views
+             query = self.queries.list_late_view_ddls_query()
+             populate_calls.append(
+                 (
+                     LineageCollectorType.VIEW_DDL_SQL_PARSING,
+                     query,
+                     self._process_view_lineage,
+                 )
+             )
+
+         if self.config.include_copy_lineage:
+             # Populate lineage for copy commands.
+             query = self.queries.list_copy_commands_sql(
+                 db_name=self.database,
+                 start_time=self.start_time,
+                 end_time=self.end_time,
+             )
+             populate_calls.append(
+                 (LineageCollectorType.COPY, query, self._process_copy_command)
+             )

-         resolved_column_refs: List[sqlglot_l.ColumnRef] = []
-         if not column_refs:
-             return column_refs
+         if self.config.include_unload_lineage:
+             # Populate lineage for unload commands.
+             query = self.queries.list_unload_commands_sql(
+                 db_name=self.database,
+                 start_time=self.start_time,
+                 end_time=self.end_time,
+             )
+             populate_calls.append(
+                 (LineageCollectorType.UNLOAD, query, self._process_unload_command)
+             )

-         if depth >= max_depth:
-             logger.warning(
-                 f"Max depth reached for resolving temporary columns: {column_refs}"
+         for lineage_type, query, processor in populate_calls:
+             self._populate_lineage_agg(
+                 query=query,
+                 lineage_type=lineage_type,
+                 processor=processor,
+                 connection=connection,
              )
-             self.report.num_unresolved_temp_columns += 1
-             return column_refs
-
-         for ref in column_refs:
-             resolved = False
-             if ref.table in self.temp_tables:
-                 table = self.temp_tables[ref.table]
-                 if table.parsed_result and table.parsed_result.column_lineage:
-                     for column_lineage in table.parsed_result.column_lineage:
-                         if (
-                             column_lineage.downstream.table == ref.table
-                             and column_lineage.downstream.column == ref.column
-                         ):
-                             resolved_column_refs.extend(
-                                 self.resolve_column_refs(
-                                     column_lineage.upstreams, depth=depth + 1
-                                 )
-                             )
-                             resolved = True
-                             break
-                 # If we reach here, it means that we were not able to resolve the column reference.
-                 if resolved is False:
-                     logger.warning(
-                         f"Unable to resolve column reference {ref} to a permanent table"
-                     )
-             else:
-                 logger.debug(
-                     f"Resolved column reference {ref} is not resolved because referenced table {ref.table} is not a temp table or not found. Adding reference as non-temp table. This is normal."
-                 )
-                 resolved_column_refs.append(ref)
-         return resolved_column_refs

-     def _update_target_dataset_cll(
-         self,
-         temp_table_urn: str,
-         target_dataset_cll: List[sqlglot_l.ColumnLineageInfo],
-         source_dataset_cll: List[sqlglot_l.ColumnLineageInfo],
-     ) -> None:
-         for target_column_lineage in target_dataset_cll:
-             upstreams: List[sqlglot_l.ColumnRef] = []
-             # Look for temp_table_urn in upstream of column_lineage, if found then we need to replace it with
-             # column of permanent table
-             for target_column_ref in target_column_lineage.upstreams:
-                 if target_column_ref.table == temp_table_urn:
-                     # Look for column_ref.table and column_ref.column in downstream of source_dataset_cll.
-                     # The source_dataset_cll contains CLL generated from create statement of temp table (temp_table_urn)
-                     for source_column_lineage in source_dataset_cll:
-                         if (
-                             source_column_lineage.downstream.table
-                             == target_column_ref.table
-                             and source_column_lineage.downstream.column
-                             == target_column_ref.column
-                         ):
-                             resolved_columns = self.resolve_column_refs(
-                                 source_column_lineage.upstreams
-                             )
-                             # Add all upstream of above temporary column into upstream of target column
-                             upstreams.extend(resolved_columns)
-                             continue
-
-                 upstreams.append(target_column_ref)
-
-             if upstreams:
-                 # update the upstreams
-                 target_column_lineage.upstreams = upstreams
-
-     def _add_permanent_datasets_recursively(
+         # Populate lineage for external tables.
+         if not self.config.skip_external_tables:
+             self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
+
+     def _populate_lineage_agg(
          self,
-         db_name: str,
-         temp_table_rows: List[TempTableRow],
-         visited_tables: Set[str],
+         query: str,
+         lineage_type: LineageCollectorType,
+         processor: Callable[[LineageRow], None],
          connection: redshift_connector.Connection,
-         permanent_lineage_datasets: List[LineageDataset],
-         target_dataset_cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
      ) -> None:
-         transitive_temp_tables: List[TempTableRow] = []
+         logger.info(f"Extracting {lineage_type.name} lineage for db {self.database}")
+         try:
+             logger.debug(f"Processing {lineage_type.name} lineage query: {query}")

-         for temp_table in temp_table_rows:
-             logger.debug(
-                 f"Processing temp table with transaction id: {temp_table.transaction_id} and query text {temp_table.query_text}"
+             timer = self.report.lineage_phases_timer.setdefault(
+                 lineage_type.name, PerfTimer()
+             )
+             with timer:
+                 for lineage_row in RedshiftDataDictionary.get_lineage_rows(
+                     conn=connection, query=query
+                 ):
+                     processor(lineage_row)
+         except Exception as e:
+             self.report.warning(
+                 title="Failed to extract some lineage",
+                 message=f"Failed to extract lineage of type {lineage_type.name}",
+                 context=f"Query: '{query}'",
+                 exc=e,
              )
+             self.report_status(f"extract-{lineage_type.name}", False)
+
+     def _process_sql_parser_lineage(self, lineage_row: LineageRow) -> None:
+         ddl = lineage_row.ddl
+         if ddl is None:
+             return
+
+         # TODO actor

-             intermediate_l_datasets, cll = self._get_sources_from_query(
-                 db_name=db_name,
-                 query=temp_table.query_text,
-                 parsed_result=temp_table.parsed_result,
+         self.aggregator.add_observed_query(
+             ObservedQuery(
+                 query=ddl,
+                 default_db=self.database,
+                 default_schema=self.config.default_schema,
+                 timestamp=lineage_row.timestamp,
+                 session_id=lineage_row.session_id,
              )
+         )

-             if (
-                 temp_table.urn is not None
-                 and target_dataset_cll is not None
-                 and cll is not None
-             ):  # condition to silent the lint
-                 self._update_target_dataset_cll(
-                     temp_table_urn=temp_table.urn,
-                     target_dataset_cll=target_dataset_cll,
-                     source_dataset_cll=cll,
-                 )
+     def _make_filtered_target(self, lineage_row: LineageRow) -> Optional[DatasetUrn]:
+         target = DatasetUrn.create_from_ids(
+             self.platform,
+             f"{self.database}.{lineage_row.target_schema}.{lineage_row.target_table}",
+             env=self.config.env,
+             platform_instance=self.config.platform_instance,
+         )
+         if target.urn() not in self.known_urns:
+             logger.debug(
+                 f"Skipping lineage for {target.urn()} as it is not in known_urns"
+             )
+             return None


-             # make sure lineage dataset should not contain a temp table
-             # if such dataset is present then add it to transitive_temp_tables to resolve it to original permanent table
1066
- db, schema, table = split_qualified_table_name(lineage_dataset.urn)
654
+ return target
1067
655
 
1068
- if table in visited_tables:
1069
- # The table is already processed
1070
- continue
656
+ def _process_stl_scan_lineage(self, lineage_row: LineageRow) -> None:
657
+ target = self._make_filtered_target(lineage_row)
658
+ if not target:
659
+ return
1071
660
 
1072
- # Check if table found is again a temp table
1073
- repeated_temp_table: List[TempTableRow] = self.find_temp_tables(
1074
- temp_table_rows=list(self.temp_tables.values()),
1075
- temp_table_names=[table],
1076
- )
661
+ source = DatasetUrn.create_from_ids(
662
+ self.platform,
663
+ f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
664
+ env=self.config.env,
665
+ platform_instance=self.config.platform_instance,
666
+ )
1077
667
 
1078
- if not repeated_temp_table:
1079
- logger.debug(f"Unable to find table {table} in temp tables.")
668
+ if lineage_row.ddl is None:
669
+ logger.warning(
670
+ f"stl scan entry is missing query text for {lineage_row.source_schema}.{lineage_row.source_table}"
671
+ )
672
+ return
673
+ self.aggregator.add_known_query_lineage(
674
+ KnownQueryLineageInfo(
675
+ query_text=lineage_row.ddl,
676
+ downstream=target.urn(),
677
+ upstreams=[source.urn()],
678
+ timestamp=lineage_row.timestamp,
679
+ ),
680
+ merge_lineage=True,
681
+ )
682
+
683
+ def _process_view_lineage(self, lineage_row: LineageRow) -> None:
684
+ ddl = lineage_row.ddl
685
+ if ddl is None:
686
+ return
1080
687
 
1081
- if repeated_temp_table:
1082
- transitive_temp_tables.extend(repeated_temp_table)
1083
- visited_tables.add(table)
1084
- continue
688
+ target = self._make_filtered_target(lineage_row)
689
+ if not target:
690
+ return
1085
691
 
1086
- permanent_lineage_datasets.append(lineage_dataset)
692
+ self.aggregator.add_view_definition(
693
+ view_urn=target,
694
+ view_definition=ddl,
695
+ default_db=self.database,
696
+ default_schema=self.config.default_schema,
697
+ )
1087
698
 
1088
- if transitive_temp_tables:
1089
- # recursive call
1090
- self._add_permanent_datasets_recursively(
1091
- db_name=db_name,
1092
- temp_table_rows=transitive_temp_tables,
1093
- visited_tables=visited_tables,
1094
- connection=connection,
1095
- permanent_lineage_datasets=permanent_lineage_datasets,
1096
- target_dataset_cll=target_dataset_cll,
699
+ def _process_copy_command(self, lineage_row: LineageRow) -> None:
700
+ logger.debug(f"Processing COPY command for lineage row: {lineage_row}")
701
+ sources = self._get_sources(
702
+ lineage_type=LineageCollectorType.COPY,
703
+ db_name=self.database,
704
+ source_schema=None,
705
+ source_table=None,
706
+ ddl=None,
707
+ filename=lineage_row.filename,
708
+ )
709
+ logger.debug(f"Recognized sources: {sources}")
710
+ source = sources[0]
711
+ if not source:
712
+ logger.debug("Ignoring command since couldn't recognize proper source")
713
+ return
714
+ s3_urn = source[0].urn
715
+ logger.debug(f"Recognized s3 dataset urn: {s3_urn}")
716
+ if not lineage_row.target_schema or not lineage_row.target_table:
717
+ logger.debug(
718
+ f"Didn't find target schema (found: {lineage_row.target_schema}) or target table (found: {lineage_row.target_table})"
1097
719
  )
720
+ return
721
+ target = self._make_filtered_target(lineage_row)
722
+ if not target:
723
+ return
1098
724
 
1099
- def update_table_and_column_lineage(
1100
- self,
1101
- db_name: str,
1102
- temp_table_names: List[str],
1103
- connection: redshift_connector.Connection,
1104
- target_source_dataset: List[LineageDataset],
1105
- target_dataset_cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
1106
- ) -> int:
1107
- permanent_lineage_datasets: List[LineageDataset] = []
1108
-
1109
- temp_table_rows: List[TempTableRow] = self.find_temp_tables(
1110
- temp_table_rows=list(self.temp_tables.values()),
1111
- temp_table_names=temp_table_names,
725
+ self.aggregator.add_known_lineage_mapping(
726
+ upstream_urn=s3_urn, downstream_urn=target.urn()
1112
727
  )
1113
728
 
1114
- visited_tables: Set[str] = set(temp_table_names)
729
+ def _process_unload_command(self, lineage_row: LineageRow) -> None:
730
+ lineage_entry = self._get_target_lineage(
731
+ alias_db_name=self.database,
732
+ lineage_row=lineage_row,
733
+ lineage_type=LineageCollectorType.UNLOAD,
734
+ all_tables_set={},
735
+ )
736
+ if not lineage_entry:
737
+ return
738
+ output_urn = lineage_entry.dataset.urn
1115
739
 
1116
- self._add_permanent_datasets_recursively(
1117
- db_name=db_name,
1118
- temp_table_rows=temp_table_rows,
1119
- visited_tables=visited_tables,
1120
- connection=connection,
1121
- permanent_lineage_datasets=permanent_lineage_datasets,
1122
- target_dataset_cll=target_dataset_cll,
740
+ if not lineage_row.source_schema or not lineage_row.source_table:
741
+ return
742
+ source = DatasetUrn.create_from_ids(
743
+ self.platform,
744
+ f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
745
+ env=self.config.env,
746
+ platform_instance=self.config.platform_instance,
1123
747
  )
748
+ if source.urn() not in self.known_urns:
749
+ logger.debug(
750
+ f"Skipping unload lineage for {source.urn()} as it is not in known_urns"
751
+ )
752
+ return
753
+
754
+ self.aggregator.add_known_lineage_mapping(
755
+ upstream_urn=source.urn(), downstream_urn=output_urn
756
+ )
757
+
758
+ def _process_external_tables(
759
+ self,
760
+ all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
761
+ db_schemas: Dict[str, Dict[str, RedshiftSchema]],
762
+ ) -> None:
763
+ for schema_name, tables in all_tables[self.database].items():
764
+ logger.info(f"External table lineage: checking schema {schema_name}")
765
+ if not db_schemas[self.database].get(schema_name):
766
+ logger.warning(f"Schema {schema_name} not found")
767
+ continue
768
+ for table in tables:
769
+ schema = db_schemas[self.database][schema_name]
770
+ if (
771
+ table.is_external_table()
772
+ and schema.is_external_schema()
773
+ and schema.external_platform
774
+ ):
775
+ logger.info(
776
+ f"External table lineage: processing table {schema_name}.{table.name}"
777
+ )
778
+ # external_db_params = schema.option
779
+ upstream_platform = schema.external_platform.lower()
780
+
781
+ table_urn = mce_builder.make_dataset_urn_with_platform_instance(
782
+ self.platform,
783
+ f"{self.database}.{schema_name}.{table.name}",
784
+ platform_instance=self.config.platform_instance,
785
+ env=self.config.env,
786
+ )
787
+ if upstream_platform == self.platform:
788
+ upstream_schema = schema.get_upstream_schema_name() or "public"
789
+ upstream_dataset_name = (
790
+ f"{schema.external_database}.{upstream_schema}.{table.name}"
791
+ )
792
+ upstream_platform_instance = self.config.platform_instance
793
+ else:
794
+ upstream_dataset_name = (
795
+ f"{schema.external_database}.{table.name}"
796
+ )
797
+ upstream_platform_instance = (
798
+ self.config.platform_instance_map.get(upstream_platform)
799
+ if self.config.platform_instance_map
800
+ else None
801
+ )
1124
802
 
1125
- target_source_dataset.extend(permanent_lineage_datasets)
803
+ upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
804
+ upstream_platform,
805
+ upstream_dataset_name,
806
+ platform_instance=upstream_platform_instance,
807
+ env=self.config.env,
808
+ )
809
+
810
+ self.aggregator.add_known_lineage_mapping(
811
+ upstream_urn=upstream_urn,
812
+ downstream_urn=table_urn,
813
+ )
814
+
815
+ def generate(self) -> Iterable[MetadataWorkUnit]:
816
+ for mcp in self.aggregator.gen_metadata():
817
+ yield mcp.as_workunit()
818
+ if len(self.aggregator.report.observed_query_parse_failures) > 0:
819
+ self.report.report_warning(
820
+ title="Failed to extract some SQL lineage",
821
+ message="Unexpected error(s) while attempting to extract lineage from SQL queries. See the full logs for more details.",
822
+ context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}",
823
+ )
1126
824
 
1127
- return len(permanent_lineage_datasets)
825
+ def close(self) -> None:
826
+ self.aggregator.close()
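Note on the hunk above: the added code feeds every Redshift lineage signal into DataHub's SQL parsing aggregator (add_observed_query, add_known_query_lineage, add_known_lineage_mapping, add_view_definition), replacing the removed hand-rolled temp-table and column-reference resolution. The sketch below is a minimal, self-contained illustration of the dispatch pattern this hunk introduces: a list of (lineage type, SQL text, row processor) tuples drained by one loop, as `_populate_lineage_agg` does. Every name in the sketch (LineageType, Row, fetch_rows, the processors, the SQL strings) is a simplified stand-in for illustration only, not an API from the package.

```python
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Iterable, List, Optional, Tuple


class LineageType(Enum):  # stand-in for LineageCollectorType
    COPY = "copy"
    UNLOAD = "unload"


@dataclass
class Row:  # stand-in for LineageRow; only the fields used below
    target_schema: Optional[str]
    target_table: Optional[str]
    filename: Optional[str]


def fetch_rows(query: str) -> Iterable[Row]:
    # Stand-in for RedshiftDataDictionary.get_lineage_rows(conn, query):
    # returns a canned row instead of querying Redshift system tables.
    yield Row(target_schema="public", target_table="orders", filename="s3://bucket/orders.csv")


def process_copy(row: Row) -> None:
    # One processor per lineage type, mirroring _process_copy_command.
    print(f"COPY lineage: {row.filename} -> {row.target_schema}.{row.target_table}")


def process_unload(row: Row) -> None:
    # Mirrors _process_unload_command (direction reversed: table -> file).
    print(f"UNLOAD lineage: {row.target_schema}.{row.target_table} -> {row.filename}")


# The dispatch pattern from the diff: collect (lineage type, SQL, processor)
# tuples up front, then run each one through a single generic loop.
populate_calls: List[Tuple[LineageType, str, Callable[[Row], None]]] = [
    (LineageType.COPY, "SELECT ... -- placeholder copy-commands SQL", process_copy),
    (LineageType.UNLOAD, "SELECT ... -- placeholder unload-commands SQL", process_unload),
]

for lineage_type, query, processor in populate_calls:
    # Mirrors _populate_lineage_agg: one query per lineage type,
    # with each returned row handed to the matching processor.
    for row in fetch_rows(query):
        processor(row)
```

Keeping the per-type work in small processor callbacks is what lets the real `_populate_lineage_agg` wrap the row loop in a try/except and report a failure for one lineage type without aborting the remaining phases.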