acryl_datahub-0.14.1.13rc2-py3-none-any.whl → acryl_datahub-0.14.1.13rc4-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

@@ -1,4 +1,4 @@
- datahub/__init__.py,sha256=e3nkmi7bdUTg9Q8NH4-2X1R6P-S6P3c6Kcfa42TRdEs,577
+ datahub/__init__.py,sha256=WMIoTkprsOt2SjEA2gbvdoJ3VkTIeWCVPilQAqNdCQw,577
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -96,7 +96,8 @@ datahub/configuration/datetimes.py,sha256=nayNc0mmlVKH6oVv9ud6C1dDUiZPGabW-YZxvr
  datahub/configuration/git.py,sha256=s55eUHxKqVZgtVsISaDyS-1F4iZBiybbjYsjbp5LU5o,6135
  datahub/configuration/import_resolver.py,sha256=b4Ie9L7knN1LALEVMxTcNFSklDD6CVE-4Ipy4ZYhNYA,369
  datahub/configuration/json_loader.py,sha256=vIDnjwXWi9yHDO8KW64EupOzOb_sspehGCD7xGHzg84,302
- datahub/configuration/kafka.py,sha256=GFHXnTyDAEHhpxnVUOepQ7aKYZOd9mTQyx1-MM5m88A,2126
+ datahub/configuration/kafka.py,sha256=MlIwpd5FFyOyjdDXW_X9JTLNk7f988sPMgevkcZYVgI,2579
+ datahub/configuration/kafka_consumer_config.py,sha256=DSwUU4HoqNyK4CNk9eIbX3eYsJMGQvORDiy1ZxkjlRc,1022
  datahub/configuration/pattern_utils.py,sha256=Q5IB9RfWOOo5FvRVBU7XkhiwHCxSQ1NTMfUlWtWI9qc,699
  datahub/configuration/pydantic_migration_helpers.py,sha256=4C_COAVZ5iJ8yxcWNgXZNWsY7ULogICNZ368oNF7zWg,1462
  datahub/configuration/source_common.py,sha256=68LZOuB23zSEcfgQJE1wZQnyYQHVVnEZK3Sniv_nEQs,2107
@@ -182,7 +183,7 @@ datahub/ingestion/sink/datahub_rest.py,sha256=17MjPW4F6LwLjZyATIWdDdIYvC74VH6g1j
  datahub/ingestion/sink/file.py,sha256=SxXJPJpkIGoaqRjCcSmj2ZE3xE4rLlBABBGwpTj5LWI,3271
  datahub/ingestion/sink/sink_registry.py,sha256=JRBWx8qEYg0ubSTyhqwgSWctgxwyp6fva9GoN2LwBao,490
  datahub/ingestion/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/confluent_schema_registry.py,sha256=K2LrEKCukw-kqa2AHsRE7MQYS4A416RY5YtpJt4esvg,18783
+ datahub/ingestion/source/confluent_schema_registry.py,sha256=_h9D8bUXoaGcwgwB94dX6aTyLY5ve7XGdcVFSJHGSJc,18804
  datahub/ingestion/source/csv_enricher.py,sha256=xjCbcsSMM8l_ASCRAnNsUGKuYMrD1lec19Waixub1EM,29498
  datahub/ingestion/source/demo_data.py,sha256=yzA_R-wfSX2WPz0i5ukYlscpmpb0Pt8D7EkhtKfftvo,1286
  datahub/ingestion/source/elastic_search.py,sha256=qFUVNzynTVJTabASTjGMu8Qhf9UpNbEtSBFjaPQjBJE,22641
@@ -191,9 +192,6 @@ datahub/ingestion/source/file.py,sha256=pH-Qkjh5FQ2XvyYPE7Z8XEY4vUk_SUHxm8p8IxG1
  datahub/ingestion/source/ge_data_profiler.py,sha256=7oUvvADX4t1WiXwOruCURh3sNEY_7I41wbwXFvKaKWM,63587
  datahub/ingestion/source/ge_profiling_config.py,sha256=WusEMGFPu17y99jTT-1tTOiN87Q1sY7nyxXvXnPs1-E,10489
  datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
- datahub/ingestion/source/kafka.py,sha256=lGTkbwPvi8ysa0HaPaoWVHYblsPLB1js92cHbj2keKY,24876
- datahub/ingestion/source/kafka_connect.py,sha256=5KUlhn3876c41Z3kx5l4oJhbu0ekXZQRdxmu52vb_v8,55167
- datahub/ingestion/source/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
  datahub/ingestion/source/ldap.py,sha256=Vnzg8tpwBYeyM-KBVVsUJvGZGBMJiCJ_i_FhxaFRQ9A,18627
  datahub/ingestion/source/metabase.py,sha256=oemiMdzjfr82Hx6rdwTNBzFM8962LDkosYh7SD_I5cY,31717
  datahub/ingestion/source/mlflow.py,sha256=SxCt4jtxQcpPWEI2rRNagCiE_6TWr2RroqmxRd_td1Y,11565
@@ -320,6 +318,10 @@ datahub/ingestion/source/iceberg/iceberg_profiler.py,sha256=hLT1Le_TEUoFXvsJSlrR
  datahub/ingestion/source/identity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/identity/azure_ad.py,sha256=GdmJFD4UMsb5353Z7phXRf-YsXR2woGLRJwBXUkgXq0,28809
  datahub/ingestion/source/identity/okta.py,sha256=PnRokWLG8wSoNZlXJiRZiW6APTEHO09q4n2j_l6m3V0,30756
+ datahub/ingestion/source/kafka/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datahub/ingestion/source/kafka/kafka.py,sha256=wC5tZ31fvXWLjse9qH3YONPhMxXIZ9QV9vbBcWDAP_w,25352
+ datahub/ingestion/source/kafka/kafka_connect.py,sha256=5KUlhn3876c41Z3kx5l4oJhbu0ekXZQRdxmu52vb_v8,55167
+ datahub/ingestion/source/kafka/kafka_schema_registry_base.py,sha256=13XjSwqyVhH1CJUFHAbWdmmv_Rw0Ju_9HQdBmIzPNNA,566
  datahub/ingestion/source/looker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/looker/lkml_patched.py,sha256=XShEU7Wbz0DubDhYMjKf9wjKZrBJa2XPg9MIjp8rPhk,733
  datahub/ingestion/source/looker/looker_common.py,sha256=cIrsc0nDEsODxx9tjvSm2A3-OlCkPKCu4W2L8PxzLWs,59926
@@ -447,7 +449,7 @@ datahub/ingestion/source/sql/mysql.py,sha256=nDWK4YbqomcJgnit9b8geUGrp_3eix4bt0_
  datahub/ingestion/source/sql/oracle.py,sha256=exJJvMSC42Oj0IiFtpAPFatWkRu9mVu8sd7ofOdanqU,22460
  datahub/ingestion/source/sql/postgres.py,sha256=uC1kYEI8VdxiZ1Y9IxMWzwmg11wtMqYN0e2fkok1rxo,11972
  datahub/ingestion/source/sql/presto.py,sha256=PB-CS5MX2dSRFRHjlxfkLHGXLZXFNCsVAAyRBtY6HMg,3611
- datahub/ingestion/source/sql/sql_common.py,sha256=Fv9ccVE-4qp9R4DdhQdy3er9u4ojHro73jogguSrpNU,51794
+ datahub/ingestion/source/sql/sql_common.py,sha256=SZkPK7aUiTdMHf5alG5724i1pyAn1jFUjcuDkKUoeL0,51813
  datahub/ingestion/source/sql/sql_config.py,sha256=M-l_uXau0ODolLZHBzAXhy-Rq5yYxvJ6cLbCIea7Mww,9449
  datahub/ingestion/source/sql/sql_generic.py,sha256=9AERvkK8kdJUeDOzCYJDb93xdv6Z4DGho0NfeHj5Uyg,2740
  datahub/ingestion/source/sql/sql_generic_profiler.py,sha256=vWna43rv9ClfK9MBNTx8tdoUr35nD5d8ppgeamEEhSQ,12528
@@ -859,10 +861,10 @@ datahub/sql_parsing/_sqlglot_patch.py,sha256=iYJ8zOThHqqbamD5jdNr9iHTWD7ewNeHzPi
  datahub/sql_parsing/query_types.py,sha256=AxoWZiLuXM6iZ2AXdSlD82gqPNr1ZFh78wn2MSTu2wY,2479
  datahub/sql_parsing/schema_resolver.py,sha256=9wbJT80K4nsIHHOuQLso9QoFQAYwfSJZnqHwsaU3UTY,10197
  datahub/sql_parsing/sql_parsing_aggregator.py,sha256=YzYmWNQjXuQYcfh91ZnqjuzCDWdmZpE-ZX14Kz6XFMs,69938
- datahub/sql_parsing/sql_parsing_common.py,sha256=ptTwHXdAHuw0jbhIalEkB3mncDx5LsQeclRPecPzPFE,2339
+ datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
- datahub/sql_parsing/sqlglot_lineage.py,sha256=jVMYZ5S36rHALiGmefQSGliRVcafLrodmfBUrTu_-9A,44787
- datahub/sql_parsing/sqlglot_utils.py,sha256=nVFG0UnHibApwNyJMEjdnJxgHreR9p9lN0Z3X6vhbnU,14448
+ datahub/sql_parsing/sqlglot_lineage.py,sha256=K532_zvj7vEt2Ci2d3kxYZLZkq4S0ECBRZ2nHuY11a4,45927
+ datahub/sql_parsing/sqlglot_utils.py,sha256=8MYzkyekhup3ihVStRPuwneWPNu17xhBg5SG8iVfFRY,14431
  datahub/sql_parsing/tool_meta_extractor.py,sha256=pE-pkRKBfNTXEJkaQM9NlG807mc-X6OtetgskJySCs8,2908
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/telemetry/stats.py,sha256=YltbtC3fe6rl1kcxn1A-mSnVpECTPm5k-brrUt7QxTI,967
@@ -888,7 +890,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
  datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
  datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
  datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
- datahub/utilities/file_backed_collections.py,sha256=5Og8AJV5WxbY7N8QOQWcRh8aNa7C3zrFKyAMaHddRmE,20232
+ datahub/utilities/file_backed_collections.py,sha256=tS6hN0kxMaaTb8rB-vDqAd973mTKshERSC55JIe_3Cw,20557
  datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
  datahub/utilities/hive_schema_to_avro.py,sha256=2-7NI9haCAYbyUmHTb-QPxjn4WbmnDDKAIGmHJ11-1E,11622
  datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
@@ -965,8 +967,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-0.14.1.13rc2.dist-info/METADATA,sha256=ocLjEMF3mpEq7e1fUg7cTaA_SkEUYNfDKL7DSKp3TEY,171138
- acryl_datahub-0.14.1.13rc2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- acryl_datahub-0.14.1.13rc2.dist-info/entry_points.txt,sha256=tOZmkU-DL9O6-BRhkwS095b_y2RlSKQ7j0hvp57iw1Q,9420
- acryl_datahub-0.14.1.13rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-0.14.1.13rc2.dist-info/RECORD,,
+ acryl_datahub-0.14.1.13rc4.dist-info/METADATA,sha256=pKvm7N_prnqdEylMVyCtB6aReDQm2T-0LmvkhiwUdrU,171138
+ acryl_datahub-0.14.1.13rc4.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ acryl_datahub-0.14.1.13rc4.dist-info/entry_points.txt,sha256=VcQx0dnqaYLyeY_L5OaX7bLmmE-Il7TAXkxCKvEn2bA,9432
+ acryl_datahub-0.14.1.13rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-0.14.1.13rc4.dist-info/RECORD,,
acryl_datahub-0.14.1.13rc4.dist-info/entry_points.txt CHANGED
@@ -56,8 +56,8 @@ hive = datahub.ingestion.source.sql.hive:HiveSource
  hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource
  iceberg = datahub.ingestion.source.iceberg.iceberg:IcebergSource
  json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource
- kafka = datahub.ingestion.source.kafka:KafkaSource
- kafka-connect = datahub.ingestion.source.kafka_connect:KafkaConnectSource
+ kafka = datahub.ingestion.source.kafka.kafka:KafkaSource
+ kafka-connect = datahub.ingestion.source.kafka.kafka_connect:KafkaConnectSource
  ldap = datahub.ingestion.source.ldap:LDAPSource
  looker = datahub.ingestion.source.looker.looker_source:LookerDashboardSource
  lookml = datahub.ingestion.source.looker.lookml_source:LookMLSource
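
Nothing changes for recipes here: the plugin names kafka and kafka-connect are stable, and only the module paths behind them move under the new datahub.ingestion.source.kafka package. A minimal sketch of how such an entry point resolves at runtime (Python 3.10+ importlib.metadata API; the group name "datahub.ingestion.source.plugins" is an assumption based on DataHub's plugin registry convention, not something shown in this diff):

    from importlib.metadata import entry_points

    # Group name is an assumption; adjust to whatever group the wheel registers.
    (kafka_ep,) = entry_points(group="datahub.ingestion.source.plugins", name="kafka")
    source_cls = kafka_ep.load()  # -> datahub.ingestion.source.kafka.kafka.KafkaSource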
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings

  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "0.14.1.13rc2"
+ __version__ = "0.14.1.13rc4"


  def is_dev_mode() -> bool:
datahub/configuration/kafka.py CHANGED
@@ -1,6 +1,7 @@
  from pydantic import Field, validator

- from datahub.configuration.common import ConfigModel
+ from datahub.configuration.common import ConfigModel, ConfigurationError
+ from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
  from datahub.configuration.validate_host_port import validate_host_port


@@ -36,6 +37,16 @@ class KafkaConsumerConnectionConfig(_KafkaConnectionConfig):
          description="Extra consumer config serialized as JSON. These options will be passed into Kafka's DeserializingConsumer. See https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#deserializingconsumer and https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md .",
      )

+     @validator("consumer_config")
+     @classmethod
+     def resolve_callback(cls, value: dict) -> dict:
+         if CallableConsumerConfig.is_callable_config(value):
+             try:
+                 value = CallableConsumerConfig(value).callable_config()
+             except Exception as e:
+                 raise ConfigurationError(e)
+         return value
+

  class KafkaProducerConnectionConfig(_KafkaConnectionConfig):
      """Configuration class for holding connectivity information for Kafka producers"""
datahub/configuration/kafka_consumer_config.py ADDED
@@ -0,0 +1,35 @@
+ import logging
+ from typing import Any, Dict, Optional
+
+ from datahub.ingestion.api.registry import import_path
+
+ logger = logging.getLogger(__name__)
+
+
+ class CallableConsumerConfig:
+     CALLBACK_ATTRIBUTE: str = "oauth_cb"
+
+     def __init__(self, config: Dict[str, Any]):
+         self._config = config
+
+         self._resolve_oauth_callback()
+
+     def callable_config(self) -> Dict[str, Any]:
+         return self._config
+
+     @staticmethod
+     def is_callable_config(config: Dict[str, Any]) -> bool:
+         return CallableConsumerConfig.CALLBACK_ATTRIBUTE in config
+
+     def get_call_back_attribute(self) -> Optional[str]:
+         return self._config.get(CallableConsumerConfig.CALLBACK_ATTRIBUTE)
+
+     def _resolve_oauth_callback(self) -> None:
+         if not self.get_call_back_attribute():
+             return
+
+         call_back = self.get_call_back_attribute()
+
+         assert call_back  # to silence the linter
+         # Set the callback
+         self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = import_path(call_back)
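
The heavy lifting is done by import_path from datahub.ingestion.api.registry, which turns a "module.submodule:attribute" style string into the object it names. A small standalone sketch of the flow, again with a hypothetical callback module:

    from datahub.configuration.kafka_consumer_config import CallableConsumerConfig

    config = {"oauth_cb": "my_company.auth:get_oauth_token"}  # hypothetical path
    if CallableConsumerConfig.is_callable_config(config):
        config = CallableConsumerConfig(config).callable_config()
    # config["oauth_cb"] now holds the resolved callable itself.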
datahub/ingestion/source/confluent_schema_registry.py CHANGED
@@ -16,8 +16,10 @@ from confluent_kafka.schema_registry.schema_registry_client import (
  from datahub.ingestion.extractor import protobuf_util, schema_util
  from datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator
  from datahub.ingestion.extractor.protobuf_util import ProtobufSchema
- from datahub.ingestion.source.kafka import KafkaSourceConfig, KafkaSourceReport
- from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase
+ from datahub.ingestion.source.kafka.kafka import KafkaSourceConfig, KafkaSourceReport
+ from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
+     KafkaSchemaRegistryBase,
+ )
  from datahub.metadata.com.linkedin.pegasus2avro.schema import (
      KafkaSchema,
      SchemaField,
File without changes
datahub/ingestion/source/kafka.py → datahub/ingestion/source/kafka/kafka.py CHANGED
@@ -18,6 +18,7 @@ from confluent_kafka.schema_registry.schema_registry_client import SchemaRegistr

  from datahub.configuration.common import AllowDenyPattern
  from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+ from datahub.configuration.kafka_consumer_config import CallableConsumerConfig
  from datahub.configuration.source_common import (
      DatasetSourceConfigMixin,
      LowerCaseDatasetUrnConfigMixin,
@@ -49,7 +50,9 @@ from datahub.ingestion.api.source import (
  )
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.common.subtypes import DatasetSubTypes
- from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase
+ from datahub.ingestion.source.kafka.kafka_schema_registry_base import (
+     KafkaSchemaRegistryBase,
+ )
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalHandler,
      StaleEntityRemovalSourceReport,
@@ -143,7 +146,7 @@ class KafkaSourceConfig(
  def get_kafka_consumer(
      connection: KafkaConsumerConnectionConfig,
  ) -> confluent_kafka.Consumer:
-     return confluent_kafka.Consumer(
+     consumer = confluent_kafka.Consumer(
          {
              "group.id": "test",
              "bootstrap.servers": connection.bootstrap,
@@ -151,6 +154,13 @@ def get_kafka_consumer(
          }
      )

+     if CallableConsumerConfig.is_callable_config(connection.consumer_config):
+         # As per documentation, we need to explicitly call the poll method to make sure OAuth callback gets executed
+         # https://docs.confluent.io/platform/current/clients/confluent-kafka-python/html/index.html#kafka-client-configuration
+         consumer.poll(timeout=30)
+
+     return consumer
+

  @dataclass
  class KafkaSourceReport(StaleEntityRemovalSourceReport):
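
For the callback itself, confluent-kafka's SASL/OAUTHBEARER support expects oauth_cb to accept the value of sasl.oauthbearer.config and return a (token, expiry) pair, with the expiry as absolute seconds since the epoch; the poll() call added above is what forces the callback to run eagerly. A sketch of a conforming callback (all names illustrative):

    import time

    def get_oauth_token(oauth_config: str):
        # A real implementation would fetch a bearer token from an identity provider.
        token = "dummy-token"              # placeholder
        expires_at = time.time() + 3600.0  # absolute expiry, seconds since epoch
        return token, expires_at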
datahub/ingestion/source/sql/sql_common.py CHANGED
@@ -32,6 +32,7 @@ from datahub.emitter.mce_builder import (
      make_data_platform_urn,
      make_dataplatform_instance_urn,
      make_dataset_urn_with_platform_instance,
+     make_schema_field_urn,
      make_tag_urn,
  )
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -669,7 +670,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
          )

          source_fields = [
-             f"urn:li:schemaField:({dataset_urn},{f})"
+             make_schema_field_urn(dataset_urn, f)
              for f in fk_dict["constrained_columns"]
          ]
          foreign_dataset = make_dataset_urn_with_platform_instance(
@@ -679,7 +680,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
              env=self.config.env,
          )
          foreign_fields = [
-             f"urn:li:schemaField:({foreign_dataset},{f})"
+             make_schema_field_urn(foreign_dataset, f)
              for f in fk_dict["referred_columns"]
          ]
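
Swapping the hand-built f-strings for make_schema_field_urn routes these URNs through the shared builder instead of ad-hoc string interpolation. For a plain column name, the output should match what the f-string produced; a minimal sketch:

    from datahub.emitter.mce_builder import make_schema_field_urn

    dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:postgres,mydb.public.users,PROD)"
    make_schema_field_urn(dataset_urn, "id")
    # 'urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,mydb.public.users,PROD),id)'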
datahub/sql_parsing/sql_parsing_common.py CHANGED
@@ -21,6 +21,9 @@ DIALECTS_WITH_CASE_INSENSITIVE_COLS = {
      # See more below:
      # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/acreldb/n0ejgx4895bofnn14rlguktfx5r3.htm
      "teradata",
+     # For SQL server, the default collation rules mean that all identifiers (schema, table, column names)
+     # are case preserving but case insensitive.
+     "mssql",
  }
  DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
      # In some dialects, column identifiers are effectively case insensitive
@@ -28,6 +31,9 @@ DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
      # automatically lowercase unquoted identifiers.
      "snowflake",
  }
+ assert DIALECTS_WITH_DEFAULT_UPPERCASE_COLS.issubset(
+     DIALECTS_WITH_CASE_INSENSITIVE_COLS
+ )


  class QueryType(enum.Enum):
datahub/sql_parsing/sqlglot_lineage.py CHANGED
@@ -5,7 +5,7 @@ import functools
  import logging
  import traceback
  from collections import defaultdict
- from typing import Any, Dict, List, Optional, Set, Tuple, Union
+ from typing import Any, Dict, List, Optional, Set, Tuple, TypeVar, Union

  import pydantic.dataclasses
  import sqlglot
@@ -873,6 +873,49 @@ def _translate_internal_column_lineage(
      )


+ _StrOrNone = TypeVar("_StrOrNone", str, Optional[str])
+
+
+ def _normalize_db_or_schema(
+     db_or_schema: _StrOrNone,
+     dialect: sqlglot.Dialect,
+ ) -> _StrOrNone:
+     if db_or_schema is None:
+         return None
+
+     # In snowflake, table identifiers must be uppercased to match sqlglot's behavior.
+     if is_dialect_instance(dialect, "snowflake"):
+         return db_or_schema.upper()
+
+     # In mssql, table identifiers must be lowercased.
+     elif is_dialect_instance(dialect, "mssql"):
+         return db_or_schema.lower()
+
+     return db_or_schema
+
+
+ def _simplify_select_into(statement: sqlglot.exp.Expression) -> sqlglot.exp.Expression:
+     """
+     Check if the expression is a SELECT INTO statement. If so, converts it into a CTAS.
+     Other expressions are returned as-is.
+     """
+
+     if not (isinstance(statement, sqlglot.exp.Select) and statement.args.get("into")):
+         return statement
+
+     # Convert from SELECT <cols> INTO <out> <expr>
+     # to CREATE TABLE <out> AS SELECT <cols> <expr>
+     into_expr: sqlglot.exp.Into = statement.args["into"].pop()
+     into_table = into_expr.this
+
+     create = sqlglot.exp.Create(
+         this=into_table,
+         kind="TABLE",
+         expression=statement,
+     )
+     return create
+
+
  def _sqlglot_lineage_inner(
      sql: sqlglot.exp.ExpOrStr,
      schema_resolver: SchemaResolverInterface,
@@ -885,12 +928,9 @@ def _sqlglot_lineage_inner(
      else:
          dialect = get_dialect(default_dialect)

-     if is_dialect_instance(dialect, "snowflake"):
-         # in snowflake, table identifiers must be uppercased to match sqlglot's behavior.
-         if default_db:
-             default_db = default_db.upper()
-         if default_schema:
-             default_schema = default_schema.upper()
+     default_db = _normalize_db_or_schema(default_db, dialect)
+     default_schema = _normalize_db_or_schema(default_schema, dialect)
+
      if is_dialect_instance(dialect, "redshift") and not default_schema:
          # On Redshift, there's no "USE SCHEMA <schema>" command. The default schema
          # is public, and "current schema" is the one at the front of the search path.
@@ -918,6 +958,8 @@ def _sqlglot_lineage_inner(
      #     original_statement.sql(pretty=True, dialect=dialect),
      # )

+     statement = _simplify_select_into(statement)
+
      # Make sure the tables are resolved with the default db / schema.
      # This only works for Unionable statements. For other types of statements,
      # we have to do it manually afterwards, but that's slightly lower accuracy
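
A minimal sketch of the new SELECT INTO rewrite, driving the (private) helper directly; "tsql" is just one dialect whose parser accepts SELECT ... INTO:

    import sqlglot
    from datahub.sql_parsing.sqlglot_lineage import _simplify_select_into

    stmt = sqlglot.parse_one("SELECT id, name INTO staging_users FROM users", dialect="tsql")
    print(_simplify_select_into(stmt).sql())
    # CREATE TABLE staging_users AS SELECT id, name FROM users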
datahub/sql_parsing/sqlglot_utils.py CHANGED
@@ -61,7 +61,7 @@ def is_dialect_instance(
      else:
          platforms = list(platforms)

-     dialects = [sqlglot.Dialect.get_or_raise(platform) for platform in platforms]
+     dialects = [get_dialect(platform) for platform in platforms]

      if any(isinstance(dialect, dialect_class.__class__) for dialect_class in dialects):
          return True
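
Routing the lookup through get_dialect keeps platform-name handling in the module's own helper rather than calling sqlglot.Dialect.get_or_raise directly. Roughly how the pair behaves:

    from datahub.sql_parsing.sqlglot_utils import get_dialect, is_dialect_instance

    dialect = get_dialect("snowflake")
    assert is_dialect_instance(dialect, "snowflake")
    assert not is_dialect_instance(dialect, "mssql")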
datahub/utilities/file_backed_collections.py CHANGED
@@ -228,6 +228,12 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
          else:
              self._conn = ConnectionWrapper()

+         if sqlite3.sqlite_version_info < (3, 24, 0):
+             # We use the ON CONFLICT clause to implement UPSERTs with sqlite.
+             # This was added in 3.24.0 from 2018-06-04.
+             # See https://www.sqlite.org/lang_conflict.html
+             raise RuntimeError("SQLite version 3.24.0 or later is required")
+
          # We keep a small cache in memory to avoid having to serialize/deserialize
          # data from the database too often. We use an OrderedDict to build
          # a poor-man's LRU cache.
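
The new guard exists because FileBackedDict leans on SQLite's upsert syntax, which requires SQLite 3.24.0 or later. A standalone sketch of both the version check and the ON CONFLICT shape it protects (the table and SQL here are illustrative, not FileBackedDict's actual schema):

    import sqlite3

    assert sqlite3.sqlite_version_info >= (3, 24, 0), sqlite3.sqlite_version

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE kv (key TEXT PRIMARY KEY, value TEXT)")
    conn.execute(
        "INSERT INTO kv (key, value) VALUES (?, ?) "
        "ON CONFLICT (key) DO UPDATE SET value = excluded.value",
        ("a", "1"),
    )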