acryl-datahub 1.0.0.3rc1__py3-none-any.whl → 1.0.0.3rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/METADATA +2513 -2513
- {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/RECORD +16 -15
- datahub/_version.py +1 -1
- datahub/emitter/mcp.py +5 -1
- datahub/ingestion/source/mlflow.py +19 -6
- datahub/ingestion/source/powerbi/config.py +12 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/testing/mcp_diff.py +15 -2
- {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/top_level.txt +0 -0
{acryl_datahub-1.0.0.3rc1.dist-info → acryl_datahub-1.0.0.3rc2.dist-info}/RECORD CHANGED

@@ -1,7 +1,7 @@
-acryl_datahub-1.0.0.
+acryl_datahub-1.0.0.3rc2.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=
+datahub/_version.py,sha256=wKoNLhdfRXZqqQqju-C7yvPFz3YKQceonahT8wrZq6Y,323
 datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
 datahub/errors.py,sha256=BzKdcmYseHOt36zfjJXc17WNutFhp9Y23cU_L6cIkxc,612
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -121,7 +121,7 @@ datahub/emitter/enum_helpers.py,sha256=QBOEUu_hDCvyL_v4ayNQV8XwJbf5zKyu0Xat0mI1K
 datahub/emitter/generic_emitter.py,sha256=i37ZFm9VR_tmiZm9kIypEkQEB_cLKbzj_tJvViN-fm8,828
 datahub/emitter/kafka_emitter.py,sha256=Uix1W1WaXF8VqUTUfzdRZKca2XrR1w50Anx2LVkROlc,5822
 datahub/emitter/mce_builder.py,sha256=i-iLLdnuy7h1JrzwC2sCtQthbso-cNj1uijOQZKHbeA,16717
-datahub/emitter/mcp.py,sha256=
+datahub/emitter/mcp.py,sha256=v7tKlIFX4s7f77KQYeFww8QbOQu6-qU609VeQiUkcsY,9796
 datahub/emitter/mcp_builder.py,sha256=8IwJAlolQkPpMqQJPLtGrsUqAcuFNs98nrI5iYUxgaU,11920
 datahub/emitter/mcp_patch_builder.py,sha256=u7cpW6DkiN7KpLapmMaXgL_FneoN69boxiANbVgMdSI,4564
 datahub/emitter/request_helper.py,sha256=HpI9a9W0TzoVbrs584rF8P8w-IT_iKLmvYmO_6IHhXs,1008
@@ -205,7 +205,7 @@ datahub/ingestion/source/ge_profiling_config.py,sha256=FlWfXoVoayabVXNMB9qETEU0G
 datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
 datahub/ingestion/source/ldap.py,sha256=CNr3foofIpoCXu_GGqfcajlQE2qkHr5isYwVcDutdkk,18695
 datahub/ingestion/source/metabase.py,sha256=j8DRV2GvisezidL1JZ5HJLF_hdFdtvaoyDoEdEyh0Ks,32603
-datahub/ingestion/source/mlflow.py,sha256=
+datahub/ingestion/source/mlflow.py,sha256=fh7izN9jlSwbpGIrEyJktlmwFZR5vNG9z9L5VQ31k_4,33141
 datahub/ingestion/source/mode.py,sha256=_FKZutF-59w0pYhko6HSVL3yjjYNd329-2DJmyfDqF8,64492
 datahub/ingestion/source/mongodb.py,sha256=2C2Cxn8DXL53IbNiywIuKt8UT_EMcPg9f8su-OPSNGU,21237
 datahub/ingestion/source/nifi.py,sha256=D1gBXxdpLuUQ0eurwofIR_SGg1rHGhwk3qxsWI1PT9c,56882
@@ -376,15 +376,16 @@ datahub/ingestion/source/metadata/lineage.py,sha256=2iK-hsORWm7NSvMZcG4D5hb8_PH5
 datahub/ingestion/source/neo4j/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/neo4j/neo4j_source.py,sha256=O3jjdnsx7IyYPBLbxowL85Qo4zs4H-maMOH4-6ZNCk4,13063
 datahub/ingestion/source/powerbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/powerbi/config.py,sha256
+datahub/ingestion/source/powerbi/config.py,sha256=-gof-85gqS_cft2blp5Uw5TVypii4T_bl8XhTZUVlgc,24707
 datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py,sha256=-njW1kJOy-LY5JFwJLhVQ0bMBj9NQz5TZhQqsSi_KsM,2285
 datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule,sha256=5df3qvalCS9hZ46DPXs6XDcw9-IofGf8Eol_rUC7LHI,20329
 datahub/ingestion/source/powerbi/powerbi.py,sha256=b9zNeT9aS7v2GWUL1SROnIMwQwAFX0YTO2UNQMLWItc,56450
 datahub/ingestion/source/powerbi/m_query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/powerbi/m_query/data_classes.py,sha256=
+datahub/ingestion/source/powerbi/m_query/data_classes.py,sha256=l_L6DzOWMShOWGtVclcf4JtNWzSINuwJka59LjwRLCk,2091
 datahub/ingestion/source/powerbi/m_query/native_sql_parser.py,sha256=zzKVDGeUM3Yv3-zNah4D6mSnr6jXsstNuLmzczcPQEE,3683
+datahub/ingestion/source/powerbi/m_query/odbc.py,sha256=fZgl8-M5s3Y-3U9OVQs7ttc8FTDbzodIM2HJtFmPNI8,5405
 datahub/ingestion/source/powerbi/m_query/parser.py,sha256=5KqhUwj9H9yL9ZMPP9oSeVGiZjvXjw6Iu_HrGr95E5M,5876
-datahub/ingestion/source/powerbi/m_query/pattern_handler.py,sha256=
+datahub/ingestion/source/powerbi/m_query/pattern_handler.py,sha256=MqZj7VBf9ppKYrA-dRaOVGFpotLFqZditwOD-6ynkFg,41635
 datahub/ingestion/source/powerbi/m_query/resolver.py,sha256=ISH8Xjx51q2S81fn2v5RhCCU-kRAW3juxM0rMFs4TDo,17413
 datahub/ingestion/source/powerbi/m_query/tree_function.py,sha256=NIKNNHAE4kTJefTM1WR-StJi9NuingaRYn_mS_kV6A8,6180
 datahub/ingestion/source/powerbi/m_query/validator.py,sha256=crG-VZy2XPieiDliP9yVMgiFcc8b2xbZyDFEATXqEAQ,1155
@@ -450,7 +451,7 @@ datahub/ingestion/source/snowflake/constants.py,sha256=XCW3vw4JfLn_s8-oXBX6WFNMP
 datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
 datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
 datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
-datahub/ingestion/source/snowflake/snowflake_config.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_config.py,sha256=SD2agFE64WgEDbQHPXQjAIP4gsHT1G9H8X_r-RvKGas,20804
 datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=pEw2O9xoTSIWDiROlkF8k4oj5zBjkqTnynLvut08yhc,17796
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
 datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
@@ -462,7 +463,7 @@ datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=1yGBbs2aWIdHnrwgeT
 datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=_37-AQyI4uGt4fu-d3v2eAWzQ3uG835ZQxMjFwGYCng,57193
 datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=maZyFkfrbVogEFM0tTKRiNp9c_1muv6YfleSd3q0umI,6341
 datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
-datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=eA9xh-G1Ydr1OwUUtrbXUWp26hE1jF0zvyKNky_i_nQ,8887
 datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=ySFm7WDk8FW9KjCnX4HQfTqObIrlUS-V8WIHl3j0CTI,24848
 datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=2lmvAeZELTjAzg4Y5E0oY41r1IzVEvg6OHAvVJftSFk,14081
 datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=nAbudDVh9A0kqao3jnIdgBlFNhNk1WIxoU1cofeXkFQ,33905
@@ -952,7 +953,7 @@ datahub/testing/check_str_enum.py,sha256=yqk0XXHOGteN-IGqCp5JHy0Kca13BnI09ZqKc4N
 datahub/testing/compare_metadata_json.py,sha256=mTU5evu7KLS3cx8OLOC1fFxj0eY1J1CGV2PEQZmapos,5361
 datahub/testing/docker_utils.py,sha256=g169iy_jNR_mg0p8X31cChZqjOryutAIHUYLq3xqueY,2415
 datahub/testing/doctest.py,sha256=1_8WEhHZ2eRQtw8vsXKzr9L5zzvs0Tcr6q4mnkyyvtw,295
-datahub/testing/mcp_diff.py,sha256=
+datahub/testing/mcp_diff.py,sha256=1BpQ3hST46cOQi1SmKdsto3j6x6Sk6yHm0vG1w9IDL0,10749
 datahub/testing/pytest_hooks.py,sha256=eifmj0M68AIfjTn_-0vtaBkKl75vNKMjsbYX-pJqmGY,1417
 datahub/upgrade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/upgrade/upgrade.py,sha256=lf60_dCu51twObAL5E8NqdrW3_2lsnUJUaB9MSEVXwI,16638
@@ -1045,8 +1046,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
-acryl_datahub-1.0.0.
+acryl_datahub-1.0.0.3rc2.dist-info/METADATA,sha256=Iez_7GLl0EEt7MEDlMXlVb-A_-YB-RO4IZJRWSwuLjI,176855
+acryl_datahub-1.0.0.3rc2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+acryl_datahub-1.0.0.3rc2.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+acryl_datahub-1.0.0.3rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.0.0.3rc2.dist-info/RECORD,,
datahub/_version.py CHANGED

datahub/emitter/mcp.py CHANGED
@@ -1,6 +1,6 @@
 import dataclasses
 import json
-from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, Union
 
 from datahub.emitter.aspect import ASPECT_MAP, JSON_CONTENT_TYPE
 from datahub.emitter.serialization_helper import post_json_transform, pre_json_transform
@@ -69,6 +69,7 @@ class MetadataChangeProposalWrapper:
     aspectName: Union[None, str] = None
     aspect: Union[None, _Aspect] = None
     systemMetadata: Union[None, SystemMetadataClass] = None
+    headers: Union[None, Dict[str, str]] = None
 
     def __post_init__(self) -> None:
         if self.entityUrn and self.entityType == _ENTITY_TYPE_UNSET:
@@ -112,6 +113,7 @@ class MetadataChangeProposalWrapper:
            auditHeader=self.auditHeader,
            aspectName=self.aspectName,
            systemMetadata=self.systemMetadata,
+           headers=self.headers,
        )
 
    def make_mcp(self) -> MetadataChangeProposalClass:
@@ -211,6 +213,7 @@ class MetadataChangeProposalWrapper:
                aspectName=mcpc.aspectName,
                aspect=aspect,
                systemMetadata=mcpc.systemMetadata,
+               headers=mcpc.headers,
            )
        else:
            return None
@@ -228,6 +231,7 @@ class MetadataChangeProposalWrapper:
            changeType=mcl.changeType,
            auditHeader=mcl.auditHeader,
            systemMetadata=mcl.systemMetadata,
+           headers=mcl.headers,
        )
        return cls.try_from_mcpc(mcpc) or mcpc
 
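For orientation, the new `headers` field rides along with each proposal when it is converted to and from the generated MCP/MCL classes. A minimal usage sketch (the URN and aspect are illustrative, not taken from this release):

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import StatusClass

    # Sketch only: attach a conditional-write header to a single proposal.
    mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)",
        aspect=StatusClass(removed=False),
        headers={"If-None-Match": "*"},  # same header style the Snowflake tag change below uses
    )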
datahub/ingestion/source/mlflow.py CHANGED

@@ -7,6 +7,7 @@ from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Unio
 from mlflow import MlflowClient
 from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
 from mlflow.entities.model_registry import ModelVersion, RegisteredModel
+from mlflow.exceptions import MlflowException
 from mlflow.store.entities import PagedList
 from pydantic.fields import Field
 
@@ -589,8 +590,8 @@ class MLflowSource(StatefulIngestionSourceBase):
        )
        return runs
 
-    @staticmethod
    def _traverse_mlflow_search_func(
+        self,
        search_func: Callable[..., PagedList[T]],
        **kwargs: Any,
    ) -> Iterable[T]:
@@ -598,12 +599,24 @@
        Utility to traverse an MLflow search_* functions which return PagedList.
        """
        next_page_token = None
-
-
-
-
-
+        try:
+            while True:
+                paged_list = search_func(page_token=next_page_token, **kwargs)
+                yield from paged_list.to_list()
+                next_page_token = paged_list.token
+                if not next_page_token:
+                    return
+        except MlflowException as e:
+            if e.error_code == "ENDPOINT_NOT_FOUND":
+                self.report.warning(
+                    title="MLflow API Endpoint Not Found for Experiments.",
+                    message="Please upgrade to version 1.28.0 or higher to ensure compatibility. Skipping ingestion for experiments and runs.",
+                    context=None,
+                    exc=e,
+                )
                return
+            else:
+                raise  # Only re-raise other exceptions
 
    def _get_latest_version(self, registered_model: RegisteredModel) -> Optional[str]:
        return (
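For context, making `_traverse_mlflow_search_func` an instance method lets it report the missing-endpoint condition through `self.report` instead of raising. A hedged sketch of a typical call site (`self.client.search_experiments` is an assumption; the actual attribute and search function used by the source may differ):

    # Sketch only: page through all experiments, tolerating MLflow servers
    # older than 1.28.0, which lack the experiments search endpoint.
    for experiment in self._traverse_mlflow_search_func(self.client.search_experiments):
        ...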
datahub/ingestion/source/powerbi/config.py CHANGED

@@ -192,6 +192,11 @@ class SupportedDataPlatform(Enum):
        datahub_data_platform_name="mysql",
    )
 
+    ODBC = DataPlatformPair(
+        powerbi_data_platform_name="Odbc",
+        datahub_data_platform_name="odbc",
+    )
+
 
 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -341,6 +346,13 @@ class PowerBiDashboardSourceConfig(
        "For Google BigQuery the datasource's server is google bigquery project name. "
        "For Databricks Unity Catalog the datasource's server is workspace FQDN.",
    )
+    # ODBC DSN to platform mapping
+    dsn_to_platform_name: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to DataHub data platform name. "
+        "For example with an ODBC connection string 'DSN=database' where the database type "
+        "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
+    )
    # deprecated warning
    _dataset_type_mapping = pydantic_field_deprecated(
        "dataset_type_mapping",
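For illustration, the example in the field description corresponds to a mapping like the following; the keys are whatever DSN names appear in the workspace's ODBC connection strings (the second entry is hypothetical):

    # Sketch only: map ODBC DSNs to DataHub platform names, following the
    # 'DSN=database' -> 'database: postgres' example in the description above.
    dsn_to_platform_name = {
        "database": "postgres",
        "warehouse_dsn": "snowflake",
    }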
datahub/ingestion/source/powerbi/m_query/odbc.py ADDED

@@ -0,0 +1,185 @@
+import re
+from typing import Optional, Tuple, Union
+
+server_patterns = [
+    r"Server=([^:]+)[:][0-9]+/.*",
+    r"SERVER=\{([^}]*)\}",
+    r"SERVER=([^;]*)",
+    r"HOST=\{([^}]*)\}",
+    r"HOST=([^;]*)",
+    r"DATA SOURCE=\{([^}]*)\}",
+    r"DATA SOURCE=([^;]*)",
+    r"DSN=\{([^}]*)\}",
+    r"DSN=([^;]*)",
+    r"Server=([^;]*)",
+    r"S3OutputLocation=([^;]*)",
+    r"HTTPPath=([^;]*)",
+    r"Host=([^;]*)",
+]
+
+dsn_patterns = [
+    r"DSN\s*=\s*\"([^\"]+)\"",
+    r"DSN\s*=\s*\'([^\']+)\'",
+    r"DSN\s*=\s*([^;]+)",
+]
+
+platform_patterns = {
+    "mysql": r"mysql",
+    "postgres": r"post(gre(s|sql)?|gres)",
+    "mssql": r"(sql\s*server|mssql|sqlncli)",
+    "oracle": r"oracle",
+    "db2": r"db2",
+    "sqlite": r"sqlite",
+    "access": r"(access|\.mdb|\.accdb)",
+    "excel": r"(excel|\.xls)",
+    "firebird": r"firebird",
+    "informix": r"informix",
+    "sybase": r"sybase",
+    "teradata": r"teradata",
+    "hadoop": r"(hadoop|hive)",
+    "snowflake": r"snowflake",
+    "redshift": r"redshift",
+    "bigquery": r"bigquery",
+    "athena": r"(athena|aws\s*athena)",
+    "databricks": r"(databricks|spark)",
+}
+
+powerbi_platform_names = {
+    "mysql": "MySQL",
+    "postgres": "PostgreSQL",
+    "mssql": "SQL Server",
+    "oracle": "Oracle",
+    "db2": "IBM DB2",
+    "sqlite": "SQLite",
+    "access": "Microsoft Access",
+    "excel": "Microsoft Excel",
+    "firebird": "Firebird",
+    "informix": "IBM Informix",
+    "sybase": "SAP Sybase",
+    "teradata": "Teradata",
+    "hadoop": "Hadoop",
+    "snowflake": "Snowflake",
+    "redshift": "Amazon Redshift",
+    "bigquery": "Google BigQuery",
+    "athena": "Amazon Athena",
+    "databricks": "Databricks",
+}
+
+
+def extract_driver(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the driver name.
+    Handles whitespace in driver names and various connection string formats.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted driver name, or None if not found
+    """
+    # Match DRIVER={driver name} pattern
+    driver_match = re.search(r"DRIVER=\{([^}]*)}", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    # Alternative pattern for DRIVER=driver
+    driver_match = re.search(r"DRIVER=([^;]*)", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    return None
+
+
+def extract_dsn(connection_string: str) -> Union[str, None]:
+    """
+    Extract the DSN value from an ODBC connection string.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str or None: The extracted DSN value, or None if not found
+    """
+    for pattern in dsn_patterns:
+        match = re.search(pattern, connection_string, re.IGNORECASE)
+        if match:
+            return match.group(1).strip()
+
+    return None
+
+
+def extract_server(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the server name.
+    Handles various parameter names for server (SERVER, Host, Data Source, etc.)
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted server name, or None if not found
+    """
+    for pattern in server_patterns:
+        server_match = re.search(pattern, connection_string, re.IGNORECASE)
+        if server_match:
+            return server_match.group(1).strip()
+
+    # Special case for Athena: extract from AwsRegion if no server found
+    region_match = re.search(r"AwsRegion=([^;]*)", connection_string, re.IGNORECASE)
+    if region_match:
+        return f"aws-athena-{region_match.group(1).strip()}"
+
+    # Special case for Databricks: try to extract hostname from JDBC URL
+    jdbc_match = re.search(r"jdbc:spark://([^:;/]+)", connection_string, re.IGNORECASE)
+    if jdbc_match:
+        return jdbc_match.group(1).strip()
+
+    return None
+
+
+def extract_platform(connection_string: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Extract the database platform name from the ODBC driver name.
+    Returns the lowercase platform name.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+               Power BI platform name, or None if not recognized.
+    """
+    driver_name = extract_driver(connection_string)
+    if not driver_name:
+        return None, None
+
+    driver_lower = driver_name.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, driver_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
+
+
+def normalize_platform_name(platform: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Normalizes the platform name by matching it with predefined patterns and maps it to
+    a corresponding Power BI platform name.
+
+    Args:
+        platform (str): The platform name to normalize
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+               Power BI platform name, or None if not recognized.
+    """
+    platform_lower = platform.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, platform_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
datahub/ingestion/source/powerbi/m_query/pattern_handler.py CHANGED

@@ -29,6 +29,12 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     Lineage,
     ReferencedTable,
 )
+from datahub.ingestion.source.powerbi.m_query.odbc import (
+    extract_dsn,
+    extract_platform,
+    extract_server,
+    normalize_platform_name,
+)
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
 from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
 from datahub.sql_parsing.sqlglot_lineage import (
@@ -155,6 +161,7 @@ class AbstractLineage(ABC):
                tree_function.token_values(arg_list)
            ),
        )
+        logger.debug(f"DB Details: {arguments}")
 
        if len(arguments) < 2:
            logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
@@ -940,6 +947,147 @@ class NativeQueryLineage(AbstractLineage):
        )
 
 
+class OdbcLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} "
+            f"data-access function detail {data_access_func_detail}"
+        )
+
+        connect_string, _ = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+
+        if not connect_string:
+            self.reporter.warning(
+                title="Can not extract ODBC connect string",
+                message="Can not extract ODBC connect string from data access function. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-access-func-detail={data_access_func_detail}",
+            )
+            return Lineage.empty()
+
+        logger.debug(f"ODBC connect string: {connect_string}")
+        data_platform, powerbi_platform = extract_platform(connect_string)
+        server_name = extract_server(connect_string)
+
+        if not data_platform:
+            dsn = extract_dsn(connect_string)
+            if dsn:
+                logger.debug(f"Extracted DSN: {dsn}")
+                server_name = dsn
+            if dsn and self.config.dsn_to_platform_name:
+                logger.debug(f"Attempting to map DSN {dsn} to platform")
+                name = self.config.dsn_to_platform_name.get(dsn)
+                if name:
+                    logger.debug(f"Found DSN {dsn} mapped to platform {name}")
+                    data_platform, powerbi_platform = normalize_platform_name(name)
+
+        if not data_platform or not powerbi_platform:
+            self.reporter.warning(
+                title="Can not determine ODBC platform",
+                message="Can not determine platform from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+
+        platform_pair: DataPlatformPair = self.create_platform_pair(
+            data_platform, powerbi_platform
+        )
+
+        if not server_name and self.config.server_to_platform_instance:
+            self.reporter.warning(
+                title="Can not determine ODBC server name",
+                message="Can not determine server name with server_to_platform_instance mapping. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+        elif not server_name:
+            server_name = "unknown"
+
+        database_name = None
+        schema_name = None
+        table_name = None
+        qualified_table_name = None
+
+        temp_accessor: Optional[IdentifierAccessor] = (
+            data_access_func_detail.identifier_accessor
+        )
+
+        while temp_accessor:
+            logger.debug(
+                f"identifier = {temp_accessor.identifier} items = {temp_accessor.items}"
+            )
+            if temp_accessor.items.get("Kind") == "Database":
+                database_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Schema":
+                schema_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Table":
+                table_name = temp_accessor.items["Name"]
+
+            if temp_accessor.next is not None:
+                temp_accessor = temp_accessor.next
+            else:
+                break
+
+        if (
+            database_name is not None
+            and schema_name is not None
+            and table_name is not None
+        ):
+            qualified_table_name = f"{database_name}.{schema_name}.{table_name}"
+        elif database_name is not None and table_name is not None:
+            qualified_table_name = f"{database_name}.{table_name}"
+
+        if not qualified_table_name:
+            self.reporter.warning(
+                title="Can not determine qualified table name",
+                message="Can not determine qualified table name for ODBC data source. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-platform={data_platform}",
+            )
+            logger.warning(
+                f"Can not determine qualified table name for ODBC data source {data_platform} "
+                f"table {self.table.full_name}."
+            )
+            return Lineage.empty()
+
+        logger.debug(
+            f"ODBC Platform {data_platform} found qualified table name {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=platform_pair,
+            server=server_name,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=platform_pair,
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    @staticmethod
+    def create_platform_pair(
+        data_platform: str, powerbi_platform: str
+    ) -> DataPlatformPair:
+        return DataPlatformPair(data_platform, powerbi_platform)
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.ODBC.value
+
+
 class SupportedPattern(Enum):
     DATABRICKS_QUERY = (
         DatabricksLineage,
@@ -991,6 +1139,11 @@ class SupportedPattern(Enum):
        FunctionName.NATIVE_QUERY,
    )
 
+    ODBC = (
+        OdbcLineage,
+        FunctionName.ODBC_DATA_ACCESS,
+    )
+
    def handler(self) -> Type[AbstractLineage]:
        return self.value[0]
 
datahub/ingestion/source/snowflake/snowflake_config.py CHANGED

@@ -301,6 +301,7 @@ class SnowflakeV2Config(
        default=AllowDenyPattern.allow_all(),
        description=(
            "List of regex patterns for structured properties to include in ingestion."
+            " Applied to tags with form `<database>.<schema>.<tag_name>`."
            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
        ),
    )
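Given the clarified description, patterns are matched against the `<database>.<schema>.<tag_name>` form. A hedged sketch of a matching allow pattern (database and schema names are hypothetical):

    # Sketch only: ingest only Snowflake tags under MY_DB.GOVERNANCE when
    # extracting tags as structured properties.
    structured_property_pattern = {
        "allow": ["my_db\\.governance\\..*"],
    }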
datahub/ingestion/source/snowflake/snowflake_tag.py CHANGED

@@ -23,6 +23,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.structured import (
     StructuredPropertyDefinition,
 )
+from datahub.metadata.schema_classes import ChangeTypeClass
 from datahub.metadata.urns import (
     ContainerUrn,
     DatasetUrn,
@@ -81,7 +82,7 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
    def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
        for tag in self.data_dictionary.get_all_tags():
            if not self.config.structured_property_pattern.allowed(
-                tag.
+                tag._id_prefix_as_str()
            ):
                continue
            if self.config.extract_tags_as_structured_properties:
@@ -111,6 +112,8 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
            yield MetadataChangeProposalWrapper(
                entityUrn=urn,
                aspect=aspect,
+                changeType=ChangeTypeClass.CREATE,
+                headers={"If-None-Match": "*"},
            ).as_workunit()
 
    def _get_tags_on_object_with_propagation(
|
datahub/testing/mcp_diff.py
CHANGED
|
@@ -2,7 +2,7 @@ import dataclasses
|
|
|
2
2
|
import json
|
|
3
3
|
import re
|
|
4
4
|
from collections import defaultdict
|
|
5
|
-
from typing import Any, Dict, List, Sequence, Set, Tuple, Union
|
|
5
|
+
from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
|
|
6
6
|
|
|
7
7
|
import deepdiff.serialization
|
|
8
8
|
import yaml
|
|
@@ -34,6 +34,7 @@ class AspectForDiff:
|
|
|
34
34
|
aspect_name: str
|
|
35
35
|
aspect: Dict[str, Any] = dataclasses.field(hash=False)
|
|
36
36
|
delta_info: "DeltaInfo" = dataclasses.field(hash=False, repr=False)
|
|
37
|
+
headers: Optional[Dict[str, str]] = dataclasses.field(default=None, hash=False)
|
|
37
38
|
|
|
38
39
|
@classmethod
|
|
39
40
|
def create_from_mcp(cls, idx: int, obj: Dict[str, Any]) -> "AspectForDiff":
|
|
@@ -44,6 +45,7 @@ class AspectForDiff:
|
|
|
44
45
|
aspect_name=obj["aspectName"],
|
|
45
46
|
aspect=aspect.get("json", aspect),
|
|
46
47
|
delta_info=DeltaInfo(idx=idx, original=obj),
|
|
48
|
+
headers=obj.get("headers"),
|
|
47
49
|
)
|
|
48
50
|
|
|
49
51
|
def __repr__(self):
|
|
@@ -240,9 +242,12 @@ class MCPDiff:
|
|
|
240
242
|
s.append(serialize_aspect(ga.aspect))
|
|
241
243
|
for (i, old, new), diffs in aspect_diffs.aspects_changed.items():
|
|
242
244
|
s.append(self.report_aspect(old, i, "changed") + ":")
|
|
245
|
+
|
|
246
|
+
print_aspects = False
|
|
243
247
|
for diff_level in diffs:
|
|
244
248
|
s.append(self.report_diff_level(diff_level, i))
|
|
245
|
-
|
|
249
|
+
print_aspects |= self.is_diff_level_on_aspect(diff_level)
|
|
250
|
+
if verbose and print_aspects:
|
|
246
251
|
s.append(f"Old aspect:\n{serialize_aspect(old.aspect)}")
|
|
247
252
|
s.append(f"New aspect:\n{serialize_aspect(new.aspect)}")
|
|
248
253
|
|
|
@@ -271,6 +276,14 @@ class MCPDiff:
|
|
|
271
276
|
f"root[{idx}].", ""
|
|
272
277
|
)
|
|
273
278
|
|
|
279
|
+
@staticmethod
|
|
280
|
+
def is_diff_level_on_aspect(diff: DiffLevel) -> bool:
|
|
281
|
+
skip_print_fields = ["changeType", "headers"]
|
|
282
|
+
try:
|
|
283
|
+
return diff.path(output_format="list")[1] not in skip_print_fields
|
|
284
|
+
except IndexError:
|
|
285
|
+
return True
|
|
286
|
+
|
|
274
287
|
|
|
275
288
|
def serialize_aspect(aspect: Union[AspectForDiff, Dict[str, Any]]) -> str:
|
|
276
289
|
if isinstance(aspect, AspectForDiff): # Unpack aspect
|