acryl-datahub 0.14.1.13rc3__py3-none-any.whl → 0.14.1.13rc5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (25)
  1. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/METADATA +2493 -2493
  2. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/RECORD +25 -20
  3. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/entry_points.txt +2 -2
  4. datahub/__init__.py +1 -1
  5. datahub/configuration/kafka.py +12 -1
  6. datahub/configuration/kafka_consumer_config.py +35 -0
  7. datahub/ingestion/source/confluent_schema_registry.py +4 -2
  8. datahub/ingestion/source/kafka/__init__.py +0 -0
  9. datahub/ingestion/source/{kafka.py → kafka/kafka.py} +12 -2
  10. datahub/ingestion/source/sql/mssql/job_models.py +1 -0
  11. datahub/ingestion/source/sql/mssql/source.py +113 -38
  12. datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py +84 -0
  13. datahub/ingestion/source/sql/sql_common.py +10 -2
  14. datahub/sql_parsing/datajob.py +50 -0
  15. datahub/sql_parsing/query_types.py +10 -1
  16. datahub/sql_parsing/split_statements.py +163 -0
  17. datahub/sql_parsing/sql_parsing_aggregator.py +0 -1
  18. datahub/sql_parsing/sql_parsing_common.py +6 -0
  19. datahub/sql_parsing/sqlglot_lineage.py +49 -7
  20. datahub/sql_parsing/sqlglot_utils.py +1 -1
  21. datahub/utilities/file_backed_collections.py +6 -0
  22. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/WHEEL +0 -0
  23. {acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/top_level.txt +0 -0
  24. /datahub/ingestion/source/{kafka_connect.py → kafka/kafka_connect.py} +0 -0
  25. /datahub/ingestion/source/{kafka_schema_registry_base.py → kafka/kafka_schema_registry_base.py} +0 -0
{acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=P9sbPaOoA3VjmcHU2vfWz0jZKxxkuH45Iab7XNKqP2Q,577
+ datahub/__init__.py,sha256=sacWO6qm2tPLBhc25GFoFnt_AeiT0Qk9ZiibpHJbPhQ,577
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -96,7 +96,8 @@ datahub/configuration/datetimes.py,sha256=nayNc0mmlVKH6oVv9ud6C1dDUiZPGabW-YZxvr
  datahub/configuration/git.py,sha256=s55eUHxKqVZgtVsISaDyS-1F4iZBiybbjYsjbp5LU5o,6135
  datahub/configuration/import_resolver.py,sha256=b4Ie9L7knN1LALEVMxTcNFSklDD6CVE-4Ipy4ZYhNYA,369
  datahub/configuration/json_loader.py,sha256=vIDnjwXWi9yHDO8KW64EupOzOb_sspehGCD7xGHzg84,302
- datahub/configuration/kafka.py,sha256=GFHXnTyDAEHhpxnVUOepQ7aKYZOd9mTQyx1-MM5m88A,2126
+ datahub/configuration/kafka.py,sha256=MlIwpd5FFyOyjdDXW_X9JTLNk7f988sPMgevkcZYVgI,2579
+ datahub/configuration/kafka_consumer_config.py,sha256=DSwUU4HoqNyK4CNk9eIbX3eYsJMGQvORDiy1ZxkjlRc,1022
  datahub/configuration/pattern_utils.py,sha256=Q5IB9RfWOOo5FvRVBU7XkhiwHCxSQ1NTMfUlWtWI9qc,699
  datahub/configuration/pydantic_migration_helpers.py,sha256=4C_COAVZ5iJ8yxcWNgXZNWsY7ULogICNZ368oNF7zWg,1462
  datahub/configuration/source_common.py,sha256=68LZOuB23zSEcfgQJE1wZQnyYQHVVnEZK3Sniv_nEQs,2107
@@ -182,7 +183,7 @@ datahub/ingestion/sink/datahub_rest.py,sha256=17MjPW4F6LwLjZyATIWdDdIYvC74VH6g1j
  datahub/ingestion/sink/file.py,sha256=SxXJPJpkIGoaqRjCcSmj2ZE3xE4rLlBABBGwpTj5LWI,3271
  datahub/ingestion/sink/sink_registry.py,sha256=JRBWx8qEYg0ubSTyhqwgSWctgxwyp6fva9GoN2LwBao,490
  datahub/ingestion/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/confluent_schema_registry.py,sha256=K2LrEKCukw-kqa2AHsRE7MQYS4A416RY5YtpJt4esvg,18783
+ datahub/ingestion/source/confluent_schema_registry.py,sha256=_h9D8bUXoaGcwgwB94dX6aTyLY5ve7XGdcVFSJHGSJc,18804
  datahub/ingestion/source/csv_enricher.py,sha256=xjCbcsSMM8l_ASCRAnNsUGKuYMrD1lec19Waixub1EM,29498
  datahub/ingestion/source/demo_data.py,sha256=yzA_R-wfSX2WPz0i5ukYlscpmpb0Pt8D7EkhtKfftvo,1286
  datahub/ingestion/source/elastic_search.py,sha256=qFUVNzynTVJTabASTjGMu8Qhf9UpNbEtSBFjaPQjBJE,22641
@@ -191,9 +192,6 @@ datahub/ingestion/source/file.py,sha256=pH-Qkjh5FQ2XvyYPE7Z8XEY4vUk_SUHxm8p8IxG1
  datahub/ingestion/source/ge_data_profiler.py,sha256=7oUvvADX4t1WiXwOruCURh3sNEY_7I41wbwXFvKaKWM,63587
  datahub/ingestion/source/ge_profiling_config.py,sha256=WusEMGFPu17y99jTT-1tTOiN87Q1sY7nyxXvXnPs1-E,10489
  datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
- datahub/ingestion/source/kafka.py,sha256=lGTkbwPvi8ysa0HaPaoWVHYblsPLB1js92cHbj2keKY,24876
- datahub/ingestion/source/kafka_connect.py,sha256=5KUlhn3876c41Z3kx5l4oJhbu0ekXZQRdxmu52vb_v8,55167
- datahub/ingestion/source/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
  datahub/ingestion/source/ldap.py,sha256=Vnzg8tpwBYeyM-KBVVsUJvGZGBMJiCJ_i_FhxaFRQ9A,18627
  datahub/ingestion/source/metabase.py,sha256=oemiMdzjfr82Hx6rdwTNBzFM8962LDkosYh7SD_I5cY,31717
  datahub/ingestion/source/mlflow.py,sha256=SxCt4jtxQcpPWEI2rRNagCiE_6TWr2RroqmxRd_td1Y,11565
@@ -320,6 +318,10 @@ datahub/ingestion/source/iceberg/iceberg_profiler.py,sha256=hLT1Le_TEUoFXvsJSlrR
  datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR2woGLRJwBXUkgXq0,28809
  datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
+ datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datahub/ingestion/source/kafka/kafka.py,sha256=wC5tZ31fvXWLjse9qH3YONPhMxXIZ9QV9vbBcWDAP_w,25352
+ datahub/ingestion/source/kafka/kafka_connect.py,sha256=5KUlhn3876c41Z3kx5l4oJhbu0ekXZQRdxmu52vb_v8,55167
+ datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
  datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
  datahub/ingestion/source/looker/looker_common.py,sha256=cIrsc0nDEsODxx9tjvSm2A3-OlCkPKCu4W2L8PxzLWs,59926
@@ -447,7 +449,7 @@ datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_
  datahub/ingestion/source/sql/oracle.py,sha256=exJJvMSC42Oj0IiFtpAPFatWkRu9mVu8sd7ofOdanqU,22460
  datahub/ingestion/source/sql/postgres.py,sha256=uC1kYEI8VdxiZ1Y9IxMWzwmg11wtMqYN0e2fkok1rxo,11972
  datahub/ingestion/source/sql/presto.py,sha256=PB-CS5MX2dSRFRHjlxfkLHGXLZXFNCsVAAyRBtY6HMg,3611
- datahub/ingestion/source/sql/sql_common.py,sha256=SZkPK7aUiTdMHf5alG5724i1pyAn1jFUjcuDkKUoeL0,51813
+ datahub/ingestion/source/sql/sql_common.py,sha256=Qe481Pct1vfl-NVpgOufSEaNSMgJGD3bq1DPvjIkPTg,52164
  datahub/ingestion/source/sql/sql_config.py,sha256=M-l_uXau0ODolLZHBzAXhy-Rq5yYxvJ6cLbCIea7Mww,9449
  datahub/ingestion/source/sql/sql_generic.py,sha256=9AERvkK8kdJUeDOzCYJDb93xdv6Z4DGho0NfeHj5Uyg,2740
  datahub/ingestion/source/sql/sql_generic_profiler.py,sha256=vWna43rv9ClfK9MBNTx8tdoUr35nD5d8ppgeamEEhSQ,12528
@@ -460,8 +462,9 @@ datahub/ingestion/source/sql/trino.py,sha256=FEn_BQ3pm23hKx94ek5kk5IXGNYcBqZEhll
  datahub/ingestion/source/sql/two_tier_sql_source.py,sha256=YDrGBb5WKVls6qv17QU5foKrf71SydzEltc3WsVAhQc,5732
  datahub/ingestion/source/sql/vertica.py,sha256=pkx-1JDBmow7WhoEIEh0SLj7kff9L0zUOHDytoC43gk,33339
  datahub/ingestion/source/sql/mssql/__init__.py,sha256=1agpl8S_uDW40olkhCX_W19dbr5GO9qgjS3R7pLRZSk,87
- datahub/ingestion/source/sql/mssql/job_models.py,sha256=-wDCltoBAFHaydAV4Gs0vm-j7qKNCH5iEJOVWanhlZw,5981
- datahub/ingestion/source/sql/mssql/source.py,sha256=_qenSWn5mPrr5EPDB6Qbt9EOURqesK68Fm_pnh9sO58,26594
+ datahub/ingestion/source/sql/mssql/job_models.py,sha256=eMyR0Efl5kvi7QNgNXzd5_6PdDKYly_552Y8OGSj9PY,6012
+ datahub/ingestion/source/sql/mssql/source.py,sha256=fzpWjwexGvJgpd6Z4DCsK6Ld2vQCfPkD2M1xE4pU9Ec,29542
+ datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py,sha256=RpnvKPalAAaOD_eUg8bZ4VkGTSeLFWuy0mefwc4s3x8,2837
  datahub/ingestion/source/state/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/state/checkpoint.py,sha256=x9Xww-MIFXSKjeg1tOZXE72LehCm5OfKy3HfucgIRWM,8833
  datahub/ingestion/source/state/entity_removal_state.py,sha256=zvIsmYg7oiIu2FhecU0VfLBNToUqvKoKyDeiFfkOcyc,6611
@@ -856,13 +859,15 @@ datahub/specific/structured_property.py,sha256=IYeFyafPidNrDbn1sU65rEPwIZDS-wLY1
  datahub/sql_parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/sql_parsing/_models.py,sha256=il-xm1RcLdi1phJUV3xrTecdOGH31akqheuSC2N4YhQ,3141
  datahub/sql_parsing/_sqlglot_patch.py,sha256=iYJ8zOThHqqbamD5jdNr9iHTWD7ewNeHzPiTb6-rO3Y,7043
- datahub/sql_parsing/query_types.py,sha256=AxoWZiLuXM6iZ2AXdSlD82gqPNr1ZFh78wn2MSTu2wY,2479
+ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn0,1751
+ datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
  datahub/sql_parsing/schema_resolver.py,sha256=9wbJT80K4nsIHHOuQLso9QoFQAYwfSJZnqHwsaU3UTY,10197
- datahub/sql_parsing/sql_parsing_aggregator.py,sha256=YzYmWNQjXuQYcfh91ZnqjuzCDWdmZpE-ZX14Kz6XFMs,69938
- datahub/sql_parsing/sql_parsing_common.py,sha256=ptTwHXdAHuw0jbhIalEkB3mncDx5LsQeclRPecPzPFE,2339
+ datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
+ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=gLelf5l73EufB8qijb9ZDLANkt4o05schGg4DY-bOJs,69937
+ datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
- datahub/sql_parsing/sqlglot_lineage.py,sha256=jVMYZ5S36rHALiGmefQSGliRVcafLrodmfBUrTu_-9A,44787
- datahub/sql_parsing/sqlglot_utils.py,sha256=nVFG0UnHibApwNyJMEjdnJxgHreR9p9lN0Z3X6vhbnU,14448
+ datahub/sql_parsing/sqlglot_lineage.py,sha256=K532_zvj7vEt2Ci2d3kxYZLZkq4S0ECBRZ2nHuY11a4,45927
+ datahub/sql_parsing/sqlglot_utils.py,sha256=8MYzkyekhup3ihVStRPuwneWPNu17xhBg5SG8iVfFRY,14431
  datahub/sql_parsing/tool_meta_extractor.py,sha256=pE-pkRKBfNTXEJkaQM9NlG807mc-X6OtetgskJySCs8,2908
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/telemetry/stats.py,sha256=YltbtC3fe6rl1kcxn1A-mSnVpECTPm5k-brrUt7QxTI,967
@@ -888,7 +893,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
  datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
  datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
  datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
- datahub/utilities/file_backed_collections.py,sha256=5Og8AJV5WxbY7N8QOQWcRh8aNa7C3zrFKyAMaHddRmE,20232
+ datahub/utilities/file_backed_collections.py,sha256=tS6hN0kxMaaTb8rB-vDqAd973mTKshERSC55JIe_3Cw,20557
  datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
  datahub/utilities/hive_schema_to_avro.py,sha256=2-7NI9haCAYbyUmHTb-QPxjn4WbmnDDKAIGmHJ11-1E,11622
  datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
@@ -965,8 +970,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.14.1.13rc3.dist-info/METADATA,sha256=TighbabeG0cW7RwOPQ1AvwAcwZeZ4LWaGRzKrKDY_vE,171138
- acryl_datahub-0.14.1.13rc3.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.14.1.13rc3.dist-info/entry_points.txt,sha256=tOZmkU-DL9O6-BRhkwS095b_y2RlSKQ7j0hvp57iw1Q,9420
- acryl_datahub-0.14.1.13rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.14.1.13rc3.dist-info/RECORD,,
+ acryl_datahub-0.14.1.13rc5.dist-info/METADATA,sha256=9ZZBowQ_PYKf6d4V6EHqTTlJDY77BqxSX8uKfiIqFT4,171138
+ acryl_datahub-0.14.1.13rc5.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.14.1.13rc5.dist-info/entry_points.txt,sha256=VcQx0dnqaYLyeY_L5OaX7bLmmE-Il7TAXkxCKvEn2bA,9432
+ acryl_datahub-0.14.1.13rc5.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.14.1.13rc5.dist-info/RECORD,,
{acryl_datahub-0.14.1.13rc3.dist-info → acryl_datahub-0.14.1.13rc5.dist-info}/entry_points.txt CHANGED
@@ -56,8 +56,8 @@ hive = datahub.ingestion.source.sql.hive:HiveSource
  hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource
  iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource
  json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource
- kafka = datahub.ingestion.source.kafka:KafkaSource
- kafka-connect = datahub.ingestion.source.kafka_connect:KafkaConnectSource
+ kafka = datahub.ingestion.source.kafka.kafka:KafkaSource
+ kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource
  ldap = datahub.ingestion.source.ldap:LDAPSource
  looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource
  lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings

  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.14.1.13rc3"
+ __version__ = "0.14.1.13rc5"


  def is_dev_mode() -> bool:
datahub/configuration/kafka.py CHANGED
@@ -1,6 +1,7 @@
  from pydantic import Field, validator

- from datahub.configuration.common import ConfigModel
+ from datahub.configuration.common import ConfigModel, ConfigurationError
+ from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
  from datahub.configuration.validate_host_port import validate_host_port


@@ -36,6 +37,16 @@ class KafkaConsumerConnectionConfig(_KafkaConnectionConfig):
          description="Extra consumer config serialized as JSON. These options will be passed into Kafka's DeserializingConsumer. See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md .",
      )

+     @validator("consumer_config")
+     @classmethod
+     def resolve_callback(cls, value: dict) -> dict:
+         if CallableConsumerConfig.is_callable_config(value):
+             try:
+                 value = CallableConsumerConfig(value).callable_config()
+             except Exception as e:
+                 raise ConfigurationError(e)
+         return value
+

  class KafkaProducerConnectionConfig(_KafkaConnectionConfig):
      """Configuration class for holding connectivity information for Kafka producers"""
datahub/configuration/kafka_consumer_config.py ADDED
@@ -0,0 +1,35 @@
+ import logging
+ from typing import Any, Dict, Optional
+
+ from datahub.ingestion.api.registry import import_path
+
+ logger = logging.getLogger(__name__)
+
+
+ class CallableConsumerConfig:
+     CALLBACK_ATTRIBUTE: str = "oauth_cb"
+
+     def __init__(self, config: Dict[str, Any]):
+         self._config = config
+
+         self._resolve_oauth_callback()
+
+     def callable_config(self) -> Dict[str, Any]:
+         return self._config
+
+     @staticmethod
+     def is_callable_config(config: Dict[str, Any]) -> bool:
+         return CallableConsumerConfig.CALLBACK_ATTRIBUTE in config
+
+     def get_call_back_attribute(self) -> Optional[str]:
+         return self._config.get(CallableConsumerConfig.CALLBACK_ATTRIBUTE)
+
+     def _resolve_oauth_callback(self) -> None:
+         if not self.get_call_back_attribute():
+             return
+
+         call_back = self.get_call_back_attribute()
+
+         assert call_back  # to silent lint
+         # Set the callback
+         self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = import_path(call_back)
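The validator added to KafkaConsumerConnectionConfig (see the kafka.py hunk above) feeds consumer_config through CallableConsumerConfig, so an "oauth_cb" entry given as an import-path string is swapped for the imported callable before the dict reaches confluent_kafka. A minimal sketch of how that might be exercised; the module path my_company.kafka_auth:get_oauth_token and the SASL settings are illustrative placeholders, not part of this diff:

    from datahub.configuration.kafka import KafkaConsumerConnectionConfig

    connection = KafkaConsumerConnectionConfig(
        bootstrap="broker-1:9092",
        consumer_config={
            # Illustrative librdkafka settings for an OAUTHBEARER setup.
            "security.protocol": "SASL_SSL",
            "sasl.mechanism": "OAUTHBEARER",
            # Import-path string to a user-defined token callback (hypothetical module);
            # the new validator resolves it via import_path into a real callable.
            "oauth_cb": "my_company.kafka_auth:get_oauth_token",
        },
    )

    # After validation, the callback is a callable ready to hand to the consumer.
    assert callable(connection.consumer_config["oauth_cb"])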
datahub/ingestion/source/confluent_schema_registry.py CHANGED
@@ -16,8 +16,10 @@ from confluent_kafka.schema_registry.schema_registry_client import (
  from datahub.ingestion.extractor import protobuf_util, schema_util
  from datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator
  from datahub.ingestion.extractor.protobuf_util import ProtobufSchema
- from datahub.ingestion.source.kafka import KafkaSourceConfig, KafkaSourceReport
- from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase
+ from datahub.ingestion.source.kafka.kafka import KafkaSourceConfig, KafkaSourceReport
+ from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
+     KafkaSchemaRegistryBase,
+ )
  from datahub.metadata.com.linkedin.pegasus2avro.schema import (
      KafkaSchema,
      SchemaField,
datahub/ingestion/source/kafka/__init__.py ADDED (file without changes)
datahub/ingestion/source/{kafka.py → kafka/kafka.py} RENAMED
@@ -18,6 +18,7 @@ from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistr

  from datahub.configuration.common import AllowDenyPattern
  from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+ from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
  from datahub.configuration.source_common import (
      DatasetSourceConfigMixin,
      LowerCaseDatasetUrnConfigMixin,
@@ -49,7 +50,9 @@ from datahub.ingestion.api.source import (
  )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
- from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase
+ from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
+     KafkaSchemaRegistryBase,
+ )
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalHandler,
      StaleEntityRemovalSourceReport,
@@ -143,7 +146,7 @@ class KafkaSourceConfig(
  def get_kafka_consumer(
      connection: KafkaConsumerConnectionConfig,
  ) -> confluent_kafka.Consumer:
-     return confluent_kafka.Consumer(
+     consumer = confluent_kafka.Consumer(
          {
              "group.id": "test",
              "bootstrap.servers": connection.bootstrap,
@@ -151,6 +154,13 @@ def get_kafka_consumer(
          }
      )

+     if CallableConsumerConfig.is_callable_config(connection.consumer_config):
+         # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed
+         # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration
+         consumer.poll(timeout=30)
+
+     return consumer
+

  @dataclass
  class KafkaSourceReport(StaleEntityRemovalSourceReport):
datahub/ingestion/source/sql/mssql/job_models.py CHANGED
@@ -101,6 +101,7 @@ class StoredProcedure:
      flow: Union[MSSQLJob, MSSQLProceduresContainer]
      type: str = "STORED_PROCEDURE"
      source: str = "mssql"
+     code: Optional[str] = None

      @property
      def full_type(self) -> str:
datahub/ingestion/source/sql/mssql/source.py CHANGED
@@ -24,6 +24,8 @@ from datahub.ingestion.api.decorators import (
      platform_name,
      support_status,
  )
+ from datahub.ingestion.api.source import StructuredLogLevel
+ from datahub.ingestion.api.source_helpers import auto_workunit
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.sql.mssql.job_models import (
      JobStep,
@@ -36,6 +38,9 @@ from datahub.ingestion.source.sql.mssql.job_models import (
      ProcedureParameter,
      StoredProcedure,
  )
+ from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
+     generate_procedure_lineage,
+ )
  from datahub.ingestion.source.sql.sql_common import (
      SQLAlchemySource,
      SqlWorkUnit,
@@ -51,6 +56,7 @@ from datahub.metadata.schema_classes import (
      StringTypeClass,
      UnionTypeClass,
  )
+ from datahub.utilities.file_backed_collections import FileBackedList

  logger: logging.Logger = logging.getLogger(__name__)

@@ -99,6 +105,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
          default=False,
          description="Enable to convert the SQL Server assets urns to lowercase",
      )
+     include_lineage: bool = Field(
+         default=True,
+         description="Enable lineage extraction for stored procedures",
+     )

      @pydantic.validator("uri_args")
      def passwords_match(cls, v, values, **kwargs):
@@ -161,6 +171,7 @@ class SQLServerSource(SQLAlchemySource):
          self.current_database = None
          self.table_descriptions: Dict[str, str] = {}
          self.column_descriptions: Dict[str, str] = {}
+         self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
          if self.config.include_descriptions:
              for inspector in self.get_inspectors():
                  db_name: str = self.get_db_name(inspector)
@@ -374,7 +385,7 @@ class SQLServerSource(SQLAlchemySource):
      def loop_job_steps(
          self, job: MSSQLJob, job_steps: Dict[str, Any]
      ) -> Iterable[MetadataWorkUnit]:
-         for step_id, step_data in job_steps.items():
+         for _step_id, step_data in job_steps.items():
              step = JobStep(
                  job_name=job.formatted_name,
                  step_name=step_data["step_name"],
@@ -412,37 +423,44 @@ class SQLServerSource(SQLAlchemySource):
          if procedures:
              yield from self.construct_flow_workunits(data_flow=data_flow)
          for procedure in procedures:
-             upstream = self._get_procedure_upstream(conn, procedure)
-             downstream = self._get_procedure_downstream(conn, procedure)
-             data_job = MSSQLDataJob(
-                 entity=procedure,
-             )
-             # TODO: because of this upstream and downstream are more dependencies,
-             # can't be used as DataJobInputOutput.
-             # Should be reorganized into lineage.
-             data_job.add_property("procedure_depends_on", str(upstream.as_property))
-             data_job.add_property(
-                 "depending_on_procedure", str(downstream.as_property)
-             )
-             procedure_definition, procedure_code = self._get_procedure_code(
-                 conn, procedure
-             )
-             if procedure_definition:
-                 data_job.add_property("definition", procedure_definition)
-             if sql_config.include_stored_procedures_code and procedure_code:
-                 data_job.add_property("code", procedure_code)
-             procedure_inputs = self._get_procedure_inputs(conn, procedure)
-             properties = self._get_procedure_properties(conn, procedure)
-             data_job.add_property(
-                 "input parameters", str([param.name for param in procedure_inputs])
-             )
-             for param in procedure_inputs:
-                 data_job.add_property(
-                     f"parameter {param.name}", str(param.properties)
-                 )
-             for property_name, property_value in properties.items():
-                 data_job.add_property(property_name, str(property_value))
-             yield from self.construct_job_workunits(data_job)
+             yield from self._process_stored_procedure(conn, procedure)
+
+     def _process_stored_procedure(
+         self, conn: Connection, procedure: StoredProcedure
+     ) -> Iterable[MetadataWorkUnit]:
+         upstream = self._get_procedure_upstream(conn, procedure)
+         downstream = self._get_procedure_downstream(conn, procedure)
+         data_job = MSSQLDataJob(
+             entity=procedure,
+         )
+         # TODO: because of this upstream and downstream are more dependencies,
+         # can't be used as DataJobInputOutput.
+         # Should be reorganized into lineage.
+         data_job.add_property("procedure_depends_on", str(upstream.as_property))
+         data_job.add_property("depending_on_procedure", str(downstream.as_property))
+         procedure_definition, procedure_code = self._get_procedure_code(conn, procedure)
+         procedure.code = procedure_code
+         if procedure_definition:
+             data_job.add_property("definition", procedure_definition)
+         if procedure_code and self.config.include_stored_procedures_code:
+             data_job.add_property("code", procedure_code)
+         procedure_inputs = self._get_procedure_inputs(conn, procedure)
+         properties = self._get_procedure_properties(conn, procedure)
+         data_job.add_property(
+             "input parameters", str([param.name for param in procedure_inputs])
+         )
+         for param in procedure_inputs:
+             data_job.add_property(f"parameter {param.name}", str(param.properties))
+         for property_name, property_value in properties.items():
+             data_job.add_property(property_name, str(property_value))
+         if self.config.include_lineage:
+             # These will be used to construct lineage
+             self.stored_procedures.append(procedure)
+         yield from self.construct_job_workunits(
+             data_job,
+             # For stored procedure lineage is ingested later
+             include_lineage=False,
+         )

      @staticmethod
      def _get_procedure_downstream(
@@ -546,8 +564,8 @@ class SQLServerSource(SQLAlchemySource):
                  code_list.append(row["Text"])
                  if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip():
                      code_slice_index = index
-             definition = "\n".join(code_list[:code_slice_index])
-             code = "\n".join(code_list[code_slice_index:])
+             definition = "".join(code_list[:code_slice_index])
+             code = "".join(code_list[code_slice_index:])
          except ResourceClosedError:
              logger.warning(
                  "Connection was closed from procedure '%s'",
@@ -602,16 +620,18 @@ class SQLServerSource(SQLAlchemySource):
      def construct_job_workunits(
          self,
          data_job: MSSQLDataJob,
+         include_lineage: bool = True,
      ) -> Iterable[MetadataWorkUnit]:
          yield MetadataChangeProposalWrapper(
              entityUrn=data_job.urn,
              aspect=data_job.as_datajob_info_aspect,
          ).as_workunit()

-         yield MetadataChangeProposalWrapper(
-             entityUrn=data_job.urn,
-             aspect=data_job.as_datajob_input_output_aspect,
-         ).as_workunit()
+         if include_lineage:
+             yield MetadataChangeProposalWrapper(
+                 entityUrn=data_job.urn,
+                 aspect=data_job.as_datajob_input_output_aspect,
+             ).as_workunit()
          # TODO: Add SubType when it appear

      def construct_flow_workunits(
@@ -664,3 +684,58 @@ class SQLServerSource(SQLAlchemySource):
              if self.config.convert_urns_to_lowercase
              else qualified_table_name
          )
+
+     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+         yield from super().get_workunits_internal()
+
+         # This is done at the end so that we will have access to tables
+         # from all databases in schema_resolver and discovered_tables
+         for procedure in self.stored_procedures:
+             with self.report.report_exc(
+                 message="Failed to parse stored procedure lineage",
+                 context=procedure.full_name,
+                 level=StructuredLogLevel.WARN,
+             ):
+                 yield from auto_workunit(
+                     generate_procedure_lineage(
+                         schema_resolver=self.schema_resolver,
+                         procedure=procedure,
+                         procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
+                         is_temp_table=self.is_temp_table,
+                     )
+                 )
+
+     def is_temp_table(self, name: str) -> bool:
+         try:
+             parts = name.split(".")
+             table_name = parts[-1]
+             schema_name = parts[-2]
+             db_name = parts[-3]
+
+             if table_name.startswith("#"):
+                 return True
+
+             # This is also a temp table if
+             # 1. this name would be allowed by the dataset patterns, and
+             # 2. we have a list of discovered tables, and
+             # 3. it's not in the discovered tables list
+             if (
+                 self.config.database_pattern.allowed(db_name)
+                 and self.config.schema_pattern.allowed(schema_name)
+                 and self.config.table_pattern.allowed(name)
+                 and self.standardize_identifier_case(name)
+                 not in self.discovered_datasets
+             ):
+                 logger.debug(f"inferred as temp table {name}")
+                 return True
+
+         except Exception:
+             logger.warning(f"Error parsing table name {name} ")
+         return False
+
+     def standardize_identifier_case(self, table_ref_str: str) -> str:
+         return (
+             table_ref_str.lower()
+             if self.config.convert_urns_to_lowercase
+             else table_ref_str
+         )
datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py ADDED
@@ -0,0 +1,84 @@
+ import logging
+ from typing import Callable, Iterable, Optional
+
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure
+ from datahub.metadata.schema_classes import DataJobInputOutputClass
+ from datahub.sql_parsing.datajob import to_datajob_input_output
+ from datahub.sql_parsing.schema_resolver import SchemaResolver
+ from datahub.sql_parsing.split_statements import split_statements
+ from datahub.sql_parsing.sql_parsing_aggregator import (
+     ObservedQuery,
+     SqlParsingAggregator,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ def parse_procedure_code(
+     *,
+     schema_resolver: SchemaResolver,
+     default_db: Optional[str],
+     default_schema: Optional[str],
+     code: str,
+     is_temp_table: Callable[[str], bool],
+     raise_: bool = False,
+ ) -> Optional[DataJobInputOutputClass]:
+     aggregator = SqlParsingAggregator(
+         platform=schema_resolver.platform,
+         env=schema_resolver.env,
+         schema_resolver=schema_resolver,
+         generate_lineage=True,
+         generate_queries=False,
+         generate_usage_statistics=False,
+         generate_operations=False,
+         generate_query_subject_fields=False,
+         generate_query_usage_statistics=False,
+         is_temp_table=is_temp_table,
+     )
+     for query in split_statements(code):
+         # TODO: We should take into account `USE x` statements.
+         aggregator.add_observed_query(
+             observed=ObservedQuery(
+                 default_db=default_db,
+                 default_schema=default_schema,
+                 query=query,
+             )
+         )
+     if aggregator.report.num_observed_queries_failed and raise_:
+         logger.info(aggregator.report.as_string())
+         raise ValueError(
+             f"Failed to parse {aggregator.report.num_observed_queries_failed} queries."
+         )
+
+     mcps = list(aggregator.gen_metadata())
+     return to_datajob_input_output(
+         mcps=mcps,
+         ignore_extra_mcps=True,
+     )
+
+
+ # Is procedure handling generic enough to be added to SqlParsingAggregator?
+ def generate_procedure_lineage(
+     *,
+     schema_resolver: SchemaResolver,
+     procedure: StoredProcedure,
+     procedure_job_urn: str,
+     is_temp_table: Callable[[str], bool] = lambda _: False,
+     raise_: bool = False,
+ ) -> Iterable[MetadataChangeProposalWrapper]:
+     if procedure.code:
+         datajob_input_output = parse_procedure_code(
+             schema_resolver=schema_resolver,
+             default_db=procedure.db,
+             default_schema=procedure.schema,
+             code=procedure.code,
+             is_temp_table=is_temp_table,
+             raise_=raise_,
+         )
+
+         if datajob_input_output:
+             yield MetadataChangeProposalWrapper(
+                 entityUrn=procedure_job_urn,
+                 aspect=datajob_input_output,
+             )
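parse_procedure_code above relies on the new split_statements helper (its implementation is not shown in this diff) to break a procedure body into individual statements before handing each one to SqlParsingAggregator. A rough sketch of that splitting step in isolation, using a made-up T-SQL body; the exact statement boundaries are an assumption about split_statements' behavior:

    from datahub.sql_parsing.split_statements import split_statements

    # Made-up stored procedure body; in the source above the string comes from
    # StoredProcedure.code, captured in mssql/source.py.
    procedure_code = """
    CREATE TABLE #staging (id INT, amount DECIMAL(10, 2));
    INSERT INTO #staging SELECT id, amount FROM dbo.raw_orders;
    INSERT INTO dbo.fact_orders (id, amount) SELECT id, amount FROM #staging;
    """

    # Each statement is then wrapped in an ObservedQuery and parsed separately;
    # temp tables like #staging are filtered out via the is_temp_table callback.
    for statement in split_statements(procedure_code):
        print(statement.strip())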
datahub/ingestion/source/sql/sql_common.py CHANGED
@@ -392,6 +392,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
              platform_instance=self.config.platform_instance,
              env=self.config.env,
          )
+         self.discovered_datasets: Set[str] = set()
          self._view_definition_cache: MutableMapping[str, str]
          if self.config.use_file_backed_cache:
              self._view_definition_cache = FileBackedDict[str]()
@@ -831,8 +832,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
              self._classify(dataset_name, schema, table, data_reader, schema_metadata)

          dataset_snapshot.aspects.append(schema_metadata)
-         if self.config.include_view_lineage:
+         if self._save_schema_to_resolver():
              self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+             self.discovered_datasets.add(dataset_name)
          db_name = self.get_db_name(inspector)

          yield from self.add_table_to_schema_container(
@@ -1126,8 +1128,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
              columns,
              canonical_schema=schema_fields,
          )
-         if self.config.include_view_lineage:
+         if self._save_schema_to_resolver():
              self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+             self.discovered_datasets.add(dataset_name)
          description, properties, _ = self.get_table_properties(inspector, schema, view)
          try:
              view_definition = inspector.get_view_definition(view, schema)
@@ -1190,6 +1193,11 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
              domain_registry=self.domain_registry,
          )

+     def _save_schema_to_resolver(self):
+         return self.config.include_view_lineage or (
+             hasattr(self.config, "include_lineage") and self.config.include_lineage
+         )
+
      def _run_sql_parser(
          self, view_identifier: str, query: str, schema_resolver: SchemaResolver
      ) -> Optional[SqlParsingResult]: