acryl-datahub 0.14.1.13rc3__py3-none-any.whl → 0.14.1.13rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (25)
  1. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/METADATA +2493 -2493
  2. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/RECORD +25 -20
  3. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/entry_points.txt +2 -2
  4. datahub/__init__.py +1 -1
  5. datahub/configuration/kafka.py +12 -1
  6. datahub/configuration/kafka_consumer_config.py +35 -0
  7. datahub/ingestion/source/confluent_schema_registry.py +4 -2
  8. datahub/ingestion/source/kafka/__init__.py +0 -0
  9. datahub/ingestion/source/{kafka.py → kafka/kafka.py} +12 -2
  10. datahub/ingestion/source/sql/mssql/job_models.py +1 -0
  11. datahub/ingestion/source/sql/mssql/source.py +113 -38
  12. datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py +84 -0
  13. datahub/ingestion/source/sql/sql_common.py +10 -2
  14. datahub/sql_parsing/datajob.py +50 -0
  15. datahub/sql_parsing/query_types.py +10 -1
  16. datahub/sql_parsing/split_statements.py +163 -0
  17. datahub/sql_parsing/sql_parsing_aggregator.py +0 -1
  18. datahub/sql_parsing/sql_parsing_common.py +6 -0
  19. datahub/sql_parsing/sqlglot_lineage.py +49 -7
  20. datahub/sql_parsing/sqlglot_utils.py +1 -1
  21. datahub/utilities/file_backed_collections.py +6 -0
  22. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/WHEEL +0 -0
  23. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/top_level.txt +0 -0
  24. /datahub/ingestion/source/{kafka_connect.py → kafka/kafka_connect.py} +0 -0
  25. /datahub/ingestion/source/{kafka_schema_registry_base.py → kafka/kafka_schema_registry_base.py} +0 -0
{acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=P9sbPaOoA3VjmcHU2vfWz0jZKxxkuH45Iab7XNKqP2Q,577
+ datahub/__init__.py,sha256=sacWO6qm2tPLBhc25GFoFnt_AeiT0Qk9ZiibpHJbPhQ,577
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -96,7 +96,8 @@ datahub/configuration/datetimes.py,sha256=nayNc0mmlVKH6oVv9ud6C1dDUiZPGabW-YZxvr
  datahub/configuration/git.py,sha256=s55eUHxKqVZgtVsISaDyS-1F4iZBiybbjYsjbp5LU5o,6135
  datahub/configuration/import_resolver.py,sha256=b4Ie9L7knN1LALEVMxTcNFSklDD6CVE-4Ipy4ZYhNYA,369
  datahub/configuration/json_loader.py,sha256=vIDnjwXWi9yHDO8KW64EupOzOb_sspehGCD7xGHzg84,302
- datahub/configuration/kafka.py,sha256=GFHXnTyDAEHhpxnVUOepQ7aKYZOd9mTQyx1-MM5m88A,2126
+ datahub/configuration/kafka.py,sha256=MlIwpd5FFyOyjdDXW_X9JTLNk7f988sPMgevkcZYVgI,2579
+ datahub/configuration/kafka_consumer_config.py,sha256=DSwUU4HoqNyK4CNk9eIbX3eYsJMGQvORDiy1ZxkjlRc,1022
  datahub/configuration/pattern_utils.py,sha256=Q5IB9RfWOOo5FvRVBU7XkhiwHCxSQ1NTMfUlWtWI9qc,699
  datahub/configuration/pydantic_migration_helpers.py,sha256=4C_COAVZ5iJ8yxcWNgXZNWsY7ULogICNZ368oNF7zWg,1462
  datahub/configuration/source_common.py,sha256=68LZOuB23zSEcfgQJE1wZQnyYQHVVnEZK3Sniv_nEQs,2107
@@ -182,7 +183,7 @@ datahub/ingestion/sink/datahub_rest.py,sha256=17MjPW4F6LwLjZyATIWdDdIYvC74VH6g1j
  datahub/ingestion/sink/file.py,sha256=SxXJPJpkIGoaqRjCcSmj2ZE3xE4rLlBABBGwpTj5LWI,3271
  datahub/ingestion/sink/sink_registry.py,sha256=JRBWx8qEYg0ubSTyhqwgSWctgxwyp6fva9GoN2LwBao,490
  datahub/ingestion/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/confluent_schema_registry.py,sha256=K2LrEKCukw-kqa2AHsRE7MQYS4A416RY5YtpJt4esvg,18783
+ datahub/ingestion/source/confluent_schema_registry.py,sha256=_h9D8bUXoaGcwgwB94dX6aTyLY5ve7XGdcVFSJHGSJc,18804
  datahub/ingestion/source/csv_enricher.py,sha256=xjCbcsSMM8l_ASCRAnNsUGKuYMrD1lec19Waixub1EM,29498
  datahub/ingestion/source/demo_data.py,sha256=yzA_R-wfSX2WPz0i5ukYlscpmpb0Pt8D7EkhtKfftvo,1286
  datahub/ingestion/source/elastic_search.py,sha256=qFUVNzynTVJTabASTjGMu8Qhf9UpNbEtSBFjaPQjBJE,22641
@@ -191,9 +192,6 @@ datahub/ingestion/source/file.py,sha256=pH-Qkjh5FQ2XvyYPE7Z8XEY4vUk_SUHxm8p8IxG1
  datahub/ingestion/source/ge_data_profiler.py,sha256=7oUvvADX4t1WiXwOruCURh3sNEY_7I41wbwXFvKaKWM,63587
  datahub/ingestion/source/ge_profiling_config.py,sha256=WusEMGFPu17y99jTT-1tTOiN87Q1sY7nyxXvXnPs1-E,10489
  datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
- datahub/ingestion/source/kafka.py,sha256=lGTkbwPvi8ysa0HaPaoWVHYblsPLB1js92cHbj2keKY,24876
- datahub/ingestion/source/kafka_connect.py,sha256=5KUlhn3876c41Z3kx5l4oJhbu0ekXZQRdxmu52vb_v8,55167
- datahub/ingestion/source/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
  datahub/ingestion/source/ldap.py,sha256=Vnzg8tpwBYeyM-KBVVsUJvGZGBMJiCJ_i_FhxaFRQ9A,18627
  datahub/ingestion/source/metabase.py,sha256=oemiMdzjfr82Hx6rdwTNBzFM8962LDkosYh7SD_I5cY,31717
  datahub/ingestion/source/mlflow.py,sha256=SxCt4jtxQcpPWEI2rRNagCiE_6TWr2RroqmxRd_td1Y,11565
@@ -320,6 +318,10 @@ datahub/ingestion/source/iceberg/iceberg_profiler.py,sha256=hLT1Le_TEUoFXvsJSlrR
  datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR2woGLRJwBXUkgXq0,28809
  datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
+ datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datahub/ingestion/source/kafka/kafka.py,sha256=wC5tZ31fvXWLjse9qH3YONPhMxXIZ9QV9vbBcWDAP_w,25352
+ datahub/ingestion/source/kafka/kafka_connect.py,sha256=5KUlhn3876c41Z3kx5l4oJhbu0ekXZQRdxmu52vb_v8,55167
+ datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
  datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
  datahub/ingestion/source/looker/looker_common.py,sha256=cIrsc0nDEsODxx9tjvSm2A3-OlCkPKCu4W2L8PxzLWs,59926
@@ -447,7 +449,7 @@ datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_
  datahub/ingestion/source/sql/oracle.py,sha256=exJJvMSC42Oj0IiFtpAPFatWkRu9mVu8sd7ofOdanqU,22460
  datahub/ingestion/source/sql/postgres.py,sha256=uC1kYEI8VdxiZ1Y9IxMWzwmg11wtMqYN0e2fkok1rxo,11972
  datahub/ingestion/source/sql/presto.py,sha256=PB-CS5MX2dSRFRHjlxfkLHGXLZXFNCsVAAyRBtY6HMg,3611
- datahub/ingestion/source/sql/sql_common.py,sha256=SZkPK7aUiTdMHf5alG5724i1pyAn1jFUjcuDkKUoeL0,51813
+ datahub/ingestion/source/sql/sql_common.py,sha256=Qe481Pct1vfl-NVpgOufSEaNSMgJGD3bq1DPvjIkPTg,52164
  datahub/ingestion/source/sql/sql_config.py,sha256=M-l_uXau0ODolLZHBzAXhy-Rq5yYxvJ6cLbCIea7Mww,9449
  datahub/ingestion/source/sql/sql_generic.py,sha256=9AERvkK8kdJUeDOzCYJDb93xdv6Z4DGho0NfeHj5Uyg,2740
  datahub/ingestion/source/sql/sql_generic_profiler.py,sha256=vWna43rv9ClfK9MBNTx8tdoUr35nD5d8ppgeamEEhSQ,12528
@@ -460,8 +462,9 @@ datahub/ingestion/source/sql/trino.py,sha256=FEn_BQ3pm23hKx94ek5kk5IXGNYcBqZEhll
  datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5foKrf71SydzEltc3WsVAhQc,5732
  datahub/ingestion/source/sql/vertica.py,sha256=pkx-1JDBmow7WhoEIEh0SLj7kff9L0zUOHDytoC43gk,33339
  datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
- datahub/ingestion/source/sql/mssql/job_models.py,sha256=-wDCltoBAFHaydAV4Gs0vm-j7qKNCH5iEJOVWanhlZw,5981
- datahub/ingestion/source/sql/mssql/source.py,sha256=_qenSWn5mPrr5EPDB6Qbt9EOURqesK68Fm_pnh9sO58,26594
+ datahub/ingestion/source/sql/mssql/job_models.py,sha256=eMyR0Efl5kvi7QNgNXzd5_6PdDKYly_552Y8OGSj9PY,6012
+ datahub/ingestion/source/sql/mssql/source.py,sha256=fzpWjwexGvJgpd6Z4DCsK6Ld2vQCfPkD2M1xE4pU9Ec,29542
+ datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
  datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
  datahub/ingestion/source/state/entity_removal_state.py,sha256=zvIsmYg7oiIu2FhecU0VfLBNToUqvKoKyDeiFfkOcyc,6611
@@ -856,13 +859,15 @@ datahub/specific/structured_property.py,sha256=IYeFyafPidNrDbn1sU65rEPwIZDS-wLY1
  datahub/sql_parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/sql_parsing/_models.py,sha256=il-xm1RcLdi1phJUV3xrTecdOGH31akqheuSC2N4YhQ,3141
  datahub/sql_parsing/_sqlglot_patch.py,sha256=iYJ8zOThHqqbamD5jdNr9iHTWD7ewNeHzPiTb6-rO3Y,7043
- datahub/sql_parsing/query_types.py,sha256=AxoWZiLuXM6iZ2AXdSlD82gqPNr1ZFh78wn2MSTu2wY,2479
+ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn0,1751
+ datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
  datahub/sql_parsing/schema_resolver.py,sha256=9wbJT80K4nsIHHOuQLso9QoFQAYwfSJZnqHwsaU3UTY,10197
- datahub/sql_parsing/sql_parsing_aggregator.py,sha256=YzYmWNQjXuQYcfh91ZnqjuzCDWdmZpE-ZX14Kz6XFMs,69938
- datahub/sql_parsing/sql_parsing_common.py,sha256=ptTwHXdAHuw0jbhIalEkB3mncDx5LsQeclRPecPzPFE,2339
+ datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
+ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=gLelf5l73EufB8qijb9ZDLANkt4o05schGg4DY-bOJs,69937
+ datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
- datahub/sql_parsing/sqlglot_lineage.py,sha256=jVMYZ5S36rHALiGmefQSGliRVcafLrodmfBUrTu_-9A,44787
- datahub/sql_parsing/sqlglot_utils.py,sha256=nVFG0UnHibApwNyJMEjdnJxgHreR9p9lN0Z3X6vhbnU,14448
+ datahub/sql_parsing/sqlglot_lineage.py,sha256=K532_zvj7vEt2Ci2d3kxYZLZkq4S0ECBRZ2nHuY11a4,45927
+ datahub/sql_parsing/sqlglot_utils.py,sha256=8MYzkyekhup3ihVStRPuwneWPNu17xhBg5SG8iVfFRY,14431
  datahub/sql_parsing/tool_meta_extractor.py,sha256=pE-pkRKBfNTXEJkaQM9NlG807mc-X6OtetgskJySCs8,2908
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/telemetry/stats.py,sha256=YltbtC3fe6rl1kcxn1A-mSnVpECTPm5k-brrUt7QxTI,967
@@ -888,7 +893,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
  datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
  datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
  datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
- datahub/utilities/file_backed_collections.py,sha256=5Og8AJV5WxbY7N8QOQWcRh8aNa7C3zrFKyAMaHddRmE,20232
+ datahub/utilities/file_backed_collections.py,sha256=tS6hN0kxMaaTb8rB-vDqAd973mTKshERSC55JIe_3Cw,20557
  datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
  datahub/utilities/hive_schema_to_avro.py,sha256=2-7NI9haCAYbyUmHTb-QPxjn4WbmnDDKAIGmHJ11-1E,11622
  datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
@@ -965,8 +970,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.14.1.13rc3.dist-info/METADATA,sha256=TighbabeG0cW7RwOPQ1AvwAcwZeZ4LWaGRzKrKDY_vE,171138
- acryl_datahub-0.14.1.13rc3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.14.1.13rc3.dist-info/entry_points.txt,sha256=tOZmkU-DL9O6-BRhkwS095b_y2RlSKQ7j0hvp57iw1Q,9420
- acryl_datahub-0.14.1.13rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.14.1.13rc3.dist-info/RECORD,,
+ acryl_datahub-0.14.1.13rc5.dist-info/METADATA,sha256=9ZZBowQ_PYKf6d4V6EHqTTlJDY77BqxSX8uKfiIqFT4,171138
+ acryl_datahub-0.14.1.13rc5.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.14.1.13rc5.dist-info/entry_points.txt,sha256=VcQx0dnqaYLyeY_L5OaX7bLmmE-Il7TAXkxCKvEn2bA,9432
+ acryl_datahub-0.14.1.13rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.14.1.13rc5.dist-info/RECORD,,
{acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/entry_points.txt CHANGED
@@ -56,8 +56,8 @@ hive = datahub.ingestion.source.sql.hive:HiveSource
  hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource
  iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource
  json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource
- kafka = datahub.ingestion.source.kafka:KafkaSource
- kafka-connect = datahub.ingestion.source.kafka_connect:KafkaConnectSource
+ kafka = datahub.ingestion.source.kafka.kafka:KafkaSource
+ kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource
  ldap = datahub.ingestion.source.ldap:LDAPSource
  looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource
  lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings

  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.14.1.13rc3"
+ __version__ = "0.14.1.13rc5"


  def is_dev_mode() -> bool:
datahub/configuration/kafka.py CHANGED
@@ -1,6 +1,7 @@
  from pydantic import Field, validator

- from datahub.configuration.common import ConfigModel
+ from datahub.configuration.common import ConfigModel, ConfigurationError
+ from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
  from datahub.configuration.validate_host_port import validate_host_port


@@ -36,6 +37,16 @@ class KafkaConsumerConnectionConfig(_KafkaConnectionConfig):
          description="Extra consumer config serialized as JSON. These options will be passed into Kafka's DeserializingConsumer. See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md .",
      )

+     @validator("consumer_config")
+     @classmethod
+     def resolve_callback(cls, value: dict) -> dict:
+         if CallableConsumerConfig.is_callable_config(value):
+             try:
+                 value = CallableConsumerConfig(value).callable_config()
+             except Exception as e:
+                 raise ConfigurationError(e)
+         return value
+

  class KafkaProducerConnectionConfig(_KafkaConnectionConfig):
      """Configuration class for holding connectivity information for Kafka producers"""
datahub/configuration/kafka_consumer_config.py ADDED
@@ -0,0 +1,35 @@
+ import logging
+ from typing import Any, Dict, Optional
+
+ from datahub.ingestion.api.registry import import_path
+
+ logger = logging.getLogger(__name__)
+
+
+ class CallableConsumerConfig:
+     CALLBACK_ATTRIBUTE: str = "oauth_cb"
+
+     def __init__(self, config: Dict[str, Any]):
+         self._config = config
+
+         self._resolve_oauth_callback()
+
+     def callable_config(self) -> Dict[str, Any]:
+         return self._config
+
+     @staticmethod
+     def is_callable_config(config: Dict[str, Any]) -> bool:
+         return CallableConsumerConfig.CALLBACK_ATTRIBUTE in config
+
+     def get_call_back_attribute(self) -> Optional[str]:
+         return self._config.get(CallableConsumerConfig.CALLBACK_ATTRIBUTE)
+
+     def _resolve_oauth_callback(self) -> None:
+         if not self.get_call_back_attribute():
+             return
+
+         call_back = self.get_call_back_attribute()
+
+         assert call_back  # to silent lint
+         # Set the callback
+         self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = import_path(call_back)
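The validator added to KafkaConsumerConnectionConfig (see the kafka.py hunk above) feeds consumer_config through CallableConsumerConfig, so an "oauth_cb" entry given as an import-path string is swapped for the imported callable before the dict reaches confluent_kafka. A minimal sketch of how that might be exercised; the module path my_company.kafka_auth:get_oauth_token and the SASL settings are illustrative placeholders, not part of this diff:

    from datahub.configuration.kafka import KafkaConsumerConnectionConfig

    connection = KafkaConsumerConnectionConfig(
        bootstrap="broker-1:9092",
        consumer_config={
            # Illustrative librdkafka settings for an OAUTHBEARER setup.
            "security.protocol": "SASL_SSL",
            "sasl.mechanism": "OAUTHBEARER",
            # Import-path string to a user-defined token callback (hypothetical module);
            # the new validator resolves it via import_path into a real callable.
            "oauth_cb": "my_company.kafka_auth:get_oauth_token",
        },
    )

    # After validation, the callback is a callable ready to hand to the consumer.
    assert callable(connection.consumer_config["oauth_cb"])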
datahub/ingestion/source/confluent_schema_registry.py CHANGED
@@ -16,8 +16,10 @@ from confluent_kafka.schema_registry.schema_registry_client import (
  from datahub.ingestion.extractor import protobuf_util, schema_util
  from datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator
  from datahub.ingestion.extractor.protobuf_util import ProtobufSchema
- from datahub.ingestion.source.kafka import KafkaSourceConfig, KafkaSourceReport
- from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase
+ from datahub.ingestion.source.kafka.kafka import KafkaSourceConfig, KafkaSourceReport
+ from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
+     KafkaSchemaRegistryBase,
+ )
  from datahub.metadata.com.linkedin.pegasus2avro.schema import (
      KafkaSchema,
      SchemaField,
datahub/ingestion/source/kafka/__init__.py ADDED (file without changes)
datahub/ingestion/source/{kafka.py → kafka/kafka.py} RENAMED
@@ -18,6 +18,7 @@ from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistr

  from datahub.configuration.common import AllowDenyPattern
  from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+ from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
  from datahub.configuration.source_common import (
      DatasetSourceConfigMixin,
      LowerCaseDatasetUrnConfigMixin,
@@ -49,7 +50,9 @@ from datahub.ingestion.api.source import (
  )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
- from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase
+ from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
+     KafkaSchemaRegistryBase,
+ )
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalHandler,
      StaleEntityRemovalSourceReport,
@@ -143,7 +146,7 @@ class KafkaSourceConfig(
  def get_kafka_consumer(
      connection: KafkaConsumerConnectionConfig,
  ) -> confluent_kafka.Consumer:
-     return confluent_kafka.Consumer(
+     consumer = confluent_kafka.Consumer(
          {
              "group.id": "test",
              "bootstrap.servers": connection.bootstrap,
@@ -151,6 +154,13 @@ def get_kafka_consumer(
          }
      )

+     if CallableConsumerConfig.is_callable_config(connection.consumer_config):
+         # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed
+         # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration
+         consumer.poll(timeout=30)
+
+     return consumer
+

  @dataclass
  class KafkaSourceReport(StaleEntityRemovalSourceReport):
datahub/ingestion/source/sql/mssql/job_models.py CHANGED
@@ -101,6 +101,7 @@ class StoredProcedure:
      flow: Union[MSSQLJob, MSSQLProceduresContainer]
      type: str = "STORED_PROCEDURE"
      source: str = "mssql"
+     code: Optional[str] = None

      @property
      def full_type(self) -> str:
datahub/ingestion/source/sql/mssql/source.py CHANGED
@@ -24,6 +24,8 @@ from datahub.ingestion.api.decorators import (
      platform_name,
      support_status,
  )
+ from datahub.ingestion.api.source import StructuredLogLevel
+ from datahub.ingestion.api.source_helpers import auto_workunit
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.sql.mssql.job_models import (
      JobStep,
@@ -36,6 +38,9 @@ from datahub.ingestion.source.sql.mssql.job_models import (
      ProcedureParameter,
      StoredProcedure,
  )
+ from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
+     generate_procedure_lineage,
+ )
  from datahub.ingestion.source.sql.sql_common import (
      SQLAlchemySource,
      SqlWorkUnit,
@@ -51,6 +56,7 @@ from datahub.metadata.schema_classes import (
      StringTypeClass,
      UnionTypeClass,
  )
+ from datahub.utilities.file_backed_collections import FileBackedList

  logger: logging.Logger = logging.getLogger(__name__)

@@ -99,6 +105,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
          default=False,
          description="Enable to convert the SQL Server assets urns to lowercase",
      )
+     include_lineage: bool = Field(
+         default=True,
+         description="Enable lineage extraction for stored procedures",
+     )

      @pydantic.validator("uri_args")
      def passwords_match(cls, v, values, **kwargs):
@@ -161,6 +171,7 @@ class SQLServerSource(SQLAlchemySource):
          self.current_database = None
          self.table_descriptions: Dict[str, str] = {}
          self.column_descriptions: Dict[str, str] = {}
+         self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
          if self.config.include_descriptions:
              for inspector in self.get_inspectors():
                  db_name: str = self.get_db_name(inspector)
@@ -374,7 +385,7 @@ class SQLServerSource(SQLAlchemySource):
      def loop_job_steps(
          self, job: MSSQLJob, job_steps: Dict[str, Any]
      ) -> Iterable[MetadataWorkUnit]:
-         for step_id, step_data in job_steps.items():
+         for _step_id, step_data in job_steps.items():
              step = JobStep(
                  job_name=job.formatted_name,
                  step_name=step_data["step_name"],
@@ -412,37 +423,44 @@ class SQLServerSource(SQLAlchemySource):
          if procedures:
              yield from self.construct_flow_workunits(data_flow=data_flow)
          for procedure in procedures:
-             upstream = self._get_procedure_upstream(conn, procedure)
-             downstream = self._get_procedure_downstream(conn, procedure)
-             data_job = MSSQLDataJob(
-                 entity=procedure,
-             )
-             # TODO: because of this upstream and downstream are more dependencies,
-             # can't be used as DataJobInputOutput.
-             # Should be reorganized into lineage.
-             data_job.add_property("procedure_depends_on", str(upstream.as_property))
-             data_job.add_property(
-                 "depending_on_procedure", str(downstream.as_property)
-             )
-             procedure_definition, procedure_code = self._get_procedure_code(
-                 conn, procedure
-             )
-             if procedure_definition:
-                 data_job.add_property("definition", procedure_definition)
-             if sql_config.include_stored_procedures_code and procedure_code:
-                 data_job.add_property("code", procedure_code)
-             procedure_inputs = self._get_procedure_inputs(conn, procedure)
-             properties = self._get_procedure_properties(conn, procedure)
-             data_job.add_property(
-                 "input parameters", str([param.name for param in procedure_inputs])
-             )
-             for param in procedure_inputs:
-                 data_job.add_property(
-                     f"parameter {param.name}", str(param.properties)
-                 )
-             for property_name, property_value in properties.items():
-                 data_job.add_property(property_name, str(property_value))
-             yield from self.construct_job_workunits(data_job)
+             yield from self._process_stored_procedure(conn, procedure)
+
+     def _process_stored_procedure(
+         self, conn: Connection, procedure: StoredProcedure
+     ) -> Iterable[MetadataWorkUnit]:
+         upstream = self._get_procedure_upstream(conn, procedure)
+         downstream = self._get_procedure_downstream(conn, procedure)
+         data_job = MSSQLDataJob(
+             entity=procedure,
+         )
+         # TODO: because of this upstream and downstream are more dependencies,
+         # can't be used as DataJobInputOutput.
+         # Should be reorganized into lineage.
+         data_job.add_property("procedure_depends_on", str(upstream.as_property))
+         data_job.add_property("depending_on_procedure", str(downstream.as_property))
+         procedure_definition, procedure_code = self._get_procedure_code(conn, procedure)
+         procedure.code = procedure_code
+         if procedure_definition:
+             data_job.add_property("definition", procedure_definition)
+         if procedure_code and self.config.include_stored_procedures_code:
+             data_job.add_property("code", procedure_code)
+         procedure_inputs = self._get_procedure_inputs(conn, procedure)
+         properties = self._get_procedure_properties(conn, procedure)
+         data_job.add_property(
+             "input parameters", str([param.name for param in procedure_inputs])
+         )
+         for param in procedure_inputs:
+             data_job.add_property(f"parameter {param.name}", str(param.properties))
+         for property_name, property_value in properties.items():
+             data_job.add_property(property_name, str(property_value))
+         if self.config.include_lineage:
+             # These will be used to construct lineage
+             self.stored_procedures.append(procedure)
+         yield from self.construct_job_workunits(
+             data_job,
+             # For stored procedure lineage is ingested later
+             include_lineage=False,
+         )

      @staticmethod
      def _get_procedure_downstream(
@@ -546,8 +564,8 @@ class SQLServerSource(SQLAlchemySource):
                  code_list.append(row["Text"])
                  if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip():
                      code_slice_index = index
-             definition = "\n".join(code_list[:code_slice_index])
-             code = "\n".join(code_list[code_slice_index:])
+             definition = "".join(code_list[:code_slice_index])
+             code = "".join(code_list[code_slice_index:])
          except ResourceClosedError:
              logger.warning(
                  "Connection was closed from procedure '%s'",
@@ -602,16 +620,18 @@ class SQLServerSource(SQLAlchemySource):
      def construct_job_workunits(
          self,
          data_job: MSSQLDataJob,
+         include_lineage: bool = True,
      ) -> Iterable[MetadataWorkUnit]:
          yield MetadataChangeProposalWrapper(
              entityUrn=data_job.urn,
              aspect=data_job.as_datajob_info_aspect,
          ).as_workunit()

-         yield MetadataChangeProposalWrapper(
-             entityUrn=data_job.urn,
-             aspect=data_job.as_datajob_input_output_aspect,
-         ).as_workunit()
+         if include_lineage:
+             yield MetadataChangeProposalWrapper(
+                 entityUrn=data_job.urn,
+                 aspect=data_job.as_datajob_input_output_aspect,
+             ).as_workunit()
          # TODO: Add SubType when it appear

      def construct_flow_workunits(
@@ -664,3 +684,58 @@ class SQLServerSource(SQLAlchemySource):
              if self.config.convert_urns_to_lowercase
              else qualified_table_name
          )
+
+     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+         yield from super().get_workunits_internal()
+
+         # This is done at the end so that we will have access to tables
+         # from all databases in schema_resolver and discovered_tables
+         for procedure in self.stored_procedures:
+             with self.report.report_exc(
+                 message="Failed to parse stored procedure lineage",
+                 context=procedure.full_name,
+                 level=StructuredLogLevel.WARN,
+             ):
+                 yield from auto_workunit(
+                     generate_procedure_lineage(
+                         schema_resolver=self.schema_resolver,
+                         procedure=procedure,
+                         procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
+                         is_temp_table=self.is_temp_table,
+                     )
+                 )
+
+     def is_temp_table(self, name: str) -> bool:
+         try:
+             parts = name.split(".")
+             table_name = parts[-1]
+             schema_name = parts[-2]
+             db_name = parts[-3]
+
+             if table_name.startswith("#"):
+                 return True
+
+             # This is also a temp table if
+             # 1. this name would be allowed by the dataset patterns, and
+             # 2. we have a list of discovered tables, and
+             # 3. it's not in the discovered tables list
+             if (
+                 self.config.database_pattern.allowed(db_name)
+                 and self.config.schema_pattern.allowed(schema_name)
+                 and self.config.table_pattern.allowed(name)
+                 and self.standardize_identifier_case(name)
+                 not in self.discovered_datasets
+             ):
+                 logger.debug(f"inferred as temp table {name}")
+                 return True
+
+         except Exception:
+             logger.warning(f"Error parsing table name {name} ")
+         return False
+
+     def standardize_identifier_case(self, table_ref_str: str) -> str:
+         return (
+             table_ref_str.lower()
+             if self.config.convert_urns_to_lowercase
+             else table_ref_str
+         )
datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py ADDED
@@ -0,0 +1,84 @@
+ import logging
+ from typing import Callable, Iterable, Optional
+
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure
+ from datahub.metadata.schema_classes import DataJobInputOutputClass
+ from datahub.sql_parsing.datajob import to_datajob_input_output
+ from datahub.sql_parsing.schema_resolver import SchemaResolver
+ from datahub.sql_parsing.split_statements import split_statements
+ from datahub.sql_parsing.sql_parsing_aggregator import (
+     ObservedQuery,
+     SqlParsingAggregator,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def parse_procedure_code(
+     *,
+     schema_resolver: SchemaResolver,
+     default_db: Optional[str],
+     default_schema: Optional[str],
+     code: str,
+     is_temp_table: Callable[[str], bool],
+     raise_: bool = False,
+ ) -> Optional[DataJobInputOutputClass]:
+     aggregator = SqlParsingAggregator(
+         platform=schema_resolver.platform,
+         env=schema_resolver.env,
+         schema_resolver=schema_resolver,
+         generate_lineage=True,
+         generate_queries=False,
+         generate_usage_statistics=False,
+         generate_operations=False,
+         generate_query_subject_fields=False,
+         generate_query_usage_statistics=False,
+         is_temp_table=is_temp_table,
+     )
+     for query in split_statements(code):
+         # TODO: We should take into account `USE x` statements.
+         aggregator.add_observed_query(
+             observed=ObservedQuery(
+                 default_db=default_db,
+                 default_schema=default_schema,
+                 query=query,
+             )
+         )
+     if aggregator.report.num_observed_queries_failed and raise_:
+         logger.info(aggregator.report.as_string())
+         raise ValueError(
+             f"Failed to parse {aggregator.report.num_observed_queries_failed} queries."
+         )
+
+     mcps = list(aggregator.gen_metadata())
+     return to_datajob_input_output(
+         mcps=mcps,
+         ignore_extra_mcps=True,
+     )
+
+
+ # Is procedure handling generic enough to be added to SqlParsingAggregator?
+ def generate_procedure_lineage(
+     *,
+     schema_resolver: SchemaResolver,
+     procedure: StoredProcedure,
+     procedure_job_urn: str,
+     is_temp_table: Callable[[str], bool] = lambda _: False,
+     raise_: bool = False,
+ ) -> Iterable[MetadataChangeProposalWrapper]:
+     if procedure.code:
+         datajob_input_output = parse_procedure_code(
+             schema_resolver=schema_resolver,
+             default_db=procedure.db,
+             default_schema=procedure.schema,
+             code=procedure.code,
+             is_temp_table=is_temp_table,
+             raise_=raise_,
+         )
+
+         if datajob_input_output:
+             yield MetadataChangeProposalWrapper(
+                 entityUrn=procedure_job_urn,
+                 aspect=datajob_input_output,
+             )
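parse_procedure_code above relies on the new split_statements helper (its implementation is not shown in this diff) to break a procedure body into individual statements before handing each one to SqlParsingAggregator. A rough sketch of that splitting step in isolation, using a made-up T-SQL body; the exact statement boundaries are an assumption about split_statements' behavior:

    from datahub.sql_parsing.split_statements import split_statements

    # Made-up stored procedure body; in the source above the string comes from
    # StoredProcedure.code, captured in mssql/source.py.
    procedure_code = """
    CREATE TABLE #staging (id INT, amount DECIMAL(10, 2));
    INSERT INTO #staging SELECT id, amount FROM dbo.raw_orders;
    INSERT INTO dbo.fact_orders (id, amount) SELECT id, amount FROM #staging;
    """

    # Each statement is then wrapped in an ObservedQuery and parsed separately;
    # temp tables like #staging are filtered out via the is_temp_table callback.
    for statement in split_statements(procedure_code):
        print(statement.strip())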
datahub/ingestion/source/sql/sql_common.py CHANGED
@@ -392,6 +392,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
              platform_instance=self.config.platform_instance,
              env=self.config.env,
          )
+         self.discovered_datasets: Set[str] = set()
          self._view_definition_cache: MutableMapping[str, str]
          if self.config.use_file_backed_cache:
              self._view_definition_cache = FileBackedDict[str]()
@@ -831,8 +832,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
              self._classify(dataset_name, schema, table, data_reader, schema_metadata)

          dataset_snapshot.aspects.append(schema_metadata)
-         if self.config.include_view_lineage:
+         if self._save_schema_to_resolver():
              self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+             self.discovered_datasets.add(dataset_name)
          db_name = self.get_db_name(inspector)

          yield from self.add_table_to_schema_container(
@@ -1126,8 +1128,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
              columns,
              canonical_schema=schema_fields,
          )
-         if self.config.include_view_lineage:
+         if self._save_schema_to_resolver():
              self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+             self.discovered_datasets.add(dataset_name)
          description, properties, _ = self.get_table_properties(inspector, schema, view)
          try:
              view_definition = inspector.get_view_definition(view, schema)
@@ -1190,6 +1193,11 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
              domain_registry=self.domain_registry,
          )

+     def _save_schema_to_resolver(self):
+         return self.config.include_view_lineage or (
+             hasattr(self.config, "include_lineage") and self.config.include_lineage
+         )
+
      def _run_sql_parser(
          self, view_identifier: str, query: str, schema_resolver: SchemaResolver
      ) -> Optional[SqlParsingResult]: