acryl-datahub 1.0.0.3rc1__py3-none-any.whl → 1.0.0.3rc2__py3-none-any.whl

This diff compares the contents of two publicly available package versions as they appear in their respective public registries; it is provided for informational purposes only.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.

@@ -1,7 +1,7 @@
-acryl_datahub-1.0.0.3rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+acryl_datahub-1.0.0.3rc2.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
 datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
-datahub/_version.py,sha256=R-5q2sde87sdyofKBpzMGjN_yrh8SbPAoOTVYlH3CuU,323
+datahub/_version.py,sha256=wKoNLhdfRXZqqQqju-C7yvPFz3YKQceonahT8wrZq6Y,323
 datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
 datahub/errors.py,sha256=BzKdcmYseHOt36zfjJXc17WNutFhp9Y23cU_L6cIkxc,612
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -121,7 +121,7 @@ datahub/emitter/enum_helpers.py,sha256=QBOEUu_hDCvyL_v4ayNQV8XwJbf5zKyu0Xat0mI1K
 datahub/emitter/generic_emitter.py,sha256=i37ZFm9VR_tmiZm9kIypEkQEB_cLKbzj_tJvViN-fm8,828
 datahub/emitter/kafka_emitter.py,sha256=Uix1W1WaXF8VqUTUfzdRZKca2XrR1w50Anx2LVkROlc,5822
 datahub/emitter/mce_builder.py,sha256=i-iLLdnuy7h1JrzwC2sCtQthbso-cNj1uijOQZKHbeA,16717
-datahub/emitter/mcp.py,sha256=hAAYziDdkwjazQU0DtWMbQWY8wS09ACrKJbqxoWXdgc,9637
+datahub/emitter/mcp.py,sha256=v7tKlIFX4s7f77KQYeFww8QbOQu6-qU609VeQiUkcsY,9796
 datahub/emitter/mcp_builder.py,sha256=8IwJAlolQkPpMqQJPLtGrsUqAcuFNs98nrI5iYUxgaU,11920
 datahub/emitter/mcp_patch_builder.py,sha256=u7cpW6DkiN7KpLapmMaXgL_FneoN69boxiANbVgMdSI,4564
 datahub/emitter/request_helper.py,sha256=HpI9a9W0TzoVbrs584rF8P8w-IT_iKLmvYmO_6IHhXs,1008
@@ -205,7 +205,7 @@ datahub/ingestion/source/ge_profiling_config.py,sha256=FlWfXoVoayabVXNMB9qETEU0G
 datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
 datahub/ingestion/source/ldap.py,sha256=CNr3foofIpoCXu_GGqfcajlQE2qkHr5isYwVcDutdkk,18695
 datahub/ingestion/source/metabase.py,sha256=j8DRV2GvisezidL1JZ5HJLF_hdFdtvaoyDoEdEyh0Ks,32603
-datahub/ingestion/source/mlflow.py,sha256=2K5D95HLUhnx8jP54hK8aKNB0nPaCgXaUgO2PdL6Gto,32539
+datahub/ingestion/source/mlflow.py,sha256=fh7izN9jlSwbpGIrEyJktlmwFZR5vNG9z9L5VQ31k_4,33141
 datahub/ingestion/source/mode.py,sha256=_FKZutF-59w0pYhko6HSVL3yjjYNd329-2DJmyfDqF8,64492
 datahub/ingestion/source/mongodb.py,sha256=2C2Cxn8DXL53IbNiywIuKt8UT_EMcPg9f8su-OPSNGU,21237
 datahub/ingestion/source/nifi.py,sha256=D1gBXxdpLuUQ0eurwofIR_SGg1rHGhwk3qxsWI1PT9c,56882
@@ -376,15 +376,16 @@ datahub/ingestion/source/metadata/lineage.py,sha256=2iK-hsORWm7NSvMZcG4D5hb8_PH5
 datahub/ingestion/source/neo4j/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/neo4j/neo4j_source.py,sha256=O3jjdnsx7IyYPBLbxowL85Qo4zs4H-maMOH4-6ZNCk4,13063
 datahub/ingestion/source/powerbi/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/powerbi/config.py,sha256=bflLIq6rpZeJ7ULvN2gaAVcSHO5jTJ6vdNPvwo1LH7M,24212
+datahub/ingestion/source/powerbi/config.py,sha256=-gof-85gqS_cft2blp5Uw5TVypii4T_bl8XhTZUVlgc,24707
 datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py,sha256=-njW1kJOy-LY5JFwJLhVQ0bMBj9NQz5TZhQqsSi_KsM,2285
 datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule,sha256=5df3qvalCS9hZ46DPXs6XDcw9-IofGf8Eol_rUC7LHI,20329
 datahub/ingestion/source/powerbi/powerbi.py,sha256=b9zNeT9aS7v2GWUL1SROnIMwQwAFX0YTO2UNQMLWItc,56450
 datahub/ingestion/source/powerbi/m_query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/powerbi/m_query/data_classes.py,sha256=yDi0C13ko2dVxdLJBYvUuGbT4Q2hxQRse3sL7Ul1ZU0,2050
+datahub/ingestion/source/powerbi/m_query/data_classes.py,sha256=l_L6DzOWMShOWGtVclcf4JtNWzSINuwJka59LjwRLCk,2091
 datahub/ingestion/source/powerbi/m_query/native_sql_parser.py,sha256=zzKVDGeUM3Yv3-zNah4D6mSnr6jXsstNuLmzczcPQEE,3683
+datahub/ingestion/source/powerbi/m_query/odbc.py,sha256=fZgl8-M5s3Y-3U9OVQs7ttc8FTDbzodIM2HJtFmPNI8,5405
 datahub/ingestion/source/powerbi/m_query/parser.py,sha256=5KqhUwj9H9yL9ZMPP9oSeVGiZjvXjw6Iu_HrGr95E5M,5876
-datahub/ingestion/source/powerbi/m_query/pattern_handler.py,sha256=aOhAb8U4OEZnO4ufnb-Cm3KMpdy-JF6r9YMK3RNZs5A,35906
+datahub/ingestion/source/powerbi/m_query/pattern_handler.py,sha256=MqZj7VBf9ppKYrA-dRaOVGFpotLFqZditwOD-6ynkFg,41635
 datahub/ingestion/source/powerbi/m_query/resolver.py,sha256=ISH8Xjx51q2S81fn2v5RhCCU-kRAW3juxM0rMFs4TDo,17413
 datahub/ingestion/source/powerbi/m_query/tree_function.py,sha256=NIKNNHAE4kTJefTM1WR-StJi9NuingaRYn_mS_kV6A8,6180
 datahub/ingestion/source/powerbi/m_query/validator.py,sha256=crG-VZy2XPieiDliP9yVMgiFcc8b2xbZyDFEATXqEAQ,1155
@@ -450,7 +451,7 @@ datahub/ingestion/source/snowflake/constants.py,sha256=XCW3vw4JfLn_s8-oXBX6WFNMP
 datahub/ingestion/source/snowflake/oauth_config.py,sha256=ol9D3RmruGStJAeL8PYSQguSqcD2HfkjPkMF2AB_eZs,1277
 datahub/ingestion/source/snowflake/oauth_generator.py,sha256=fu2VnREGuJXeTqIV2jx4TwieVnznf83HQkrE0h2DGGM,3423
 datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81xxdeizJn9nJCZ_nMIXgk9N6pEk5o,4803
-datahub/ingestion/source/snowflake/snowflake_config.py,sha256=Jm3TW7ed9LYNOZ9egUwkHs2bQv_WlCD6D2QoVxIzxsI,20729
+datahub/ingestion/source/snowflake/snowflake_config.py,sha256=SD2agFE64WgEDbQHPXQjAIP4gsHT1G9H8X_r-RvKGas,20804
 datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=pEw2O9xoTSIWDiROlkF8k4oj5zBjkqTnynLvut08yhc,17796
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
 datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=FBmiONx4EGHWV8RNJT6zHZyntKinPFFyd2oKbTUIbhE,21319
@@ -462,7 +463,7 @@ datahub/ingestion/source/snowflake/snowflake_schema.py,sha256=1yGBbs2aWIdHnrwgeT
 datahub/ingestion/source/snowflake/snowflake_schema_gen.py,sha256=_37-AQyI4uGt4fu-d3v2eAWzQ3uG835ZQxMjFwGYCng,57193
 datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=maZyFkfrbVogEFM0tTKRiNp9c_1muv6YfleSd3q0umI,6341
 datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
-datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=1eLYTcgmfzDs9xktMTTE74L5SeNP48Qg3uLr9y-Ez3Y,8733
+datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=eA9xh-G1Ydr1OwUUtrbXUWp26hE1jF0zvyKNky_i_nQ,8887
 datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=ySFm7WDk8FW9KjCnX4HQfTqObIrlUS-V8WIHl3j0CTI,24848
 datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=2lmvAeZELTjAzg4Y5E0oY41r1IzVEvg6OHAvVJftSFk,14081
 datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=nAbudDVh9A0kqao3jnIdgBlFNhNk1WIxoU1cofeXkFQ,33905
@@ -952,7 +953,7 @@ datahub/testing/check_str_enum.py,sha256=yqk0XXHOGteN-IGqCp5JHy0Kca13BnI09ZqKc4N
 datahub/testing/compare_metadata_json.py,sha256=mTU5evu7KLS3cx8OLOC1fFxj0eY1J1CGV2PEQZmapos,5361
 datahub/testing/docker_utils.py,sha256=g169iy_jNR_mg0p8X31cChZqjOryutAIHUYLq3xqueY,2415
 datahub/testing/doctest.py,sha256=1_8WEhHZ2eRQtw8vsXKzr9L5zzvs0Tcr6q4mnkyyvtw,295
-datahub/testing/mcp_diff.py,sha256=Jk1NluXkKWEMLOE11mHE98lfYE_Gn2GeFuu5TNB3YSs,10198
+datahub/testing/mcp_diff.py,sha256=1BpQ3hST46cOQi1SmKdsto3j6x6Sk6yHm0vG1w9IDL0,10749
 datahub/testing/pytest_hooks.py,sha256=eifmj0M68AIfjTn_-0vtaBkKl75vNKMjsbYX-pJqmGY,1417
 datahub/upgrade/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/upgrade/upgrade.py,sha256=lf60_dCu51twObAL5E8NqdrW3_2lsnUJUaB9MSEVXwI,16638
@@ -1045,8 +1046,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-1.0.0.3rc1.dist-info/METADATA,sha256=43mPIcmD4ByKfyR6rn8PPgaKNUBSmDmVJnGm1KhBZuo,176855
-acryl_datahub-1.0.0.3rc1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-acryl_datahub-1.0.0.3rc1.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
-acryl_datahub-1.0.0.3rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
-acryl_datahub-1.0.0.3rc1.dist-info/RECORD,,
+acryl_datahub-1.0.0.3rc2.dist-info/METADATA,sha256=Iez_7GLl0EEt7MEDlMXlVb-A_-YB-RO4IZJRWSwuLjI,176855
+acryl_datahub-1.0.0.3rc2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+acryl_datahub-1.0.0.3rc2.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+acryl_datahub-1.0.0.3rc2.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-1.0.0.3rc2.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "1.0.0.3rc1"
+__version__ = "1.0.0.3rc2"
 
 
 def is_dev_mode() -> bool:
datahub/emitter/mcp.py CHANGED
@@ -1,6 +1,6 @@
 import dataclasses
 import json
-from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, Union
 
 from datahub.emitter.aspect import ASPECT_MAP, JSON_CONTENT_TYPE
 from datahub.emitter.serialization_helper import post_json_transform, pre_json_transform
@@ -69,6 +69,7 @@ class MetadataChangeProposalWrapper:
     aspectName: Union[None, str] = None
     aspect: Union[None, _Aspect] = None
     systemMetadata: Union[None, SystemMetadataClass] = None
+    headers: Union[None, Dict[str, str]] = None
 
     def __post_init__(self) -> None:
         if self.entityUrn and self.entityType == _ENTITY_TYPE_UNSET:
@@ -112,6 +113,7 @@ class MetadataChangeProposalWrapper:
             auditHeader=self.auditHeader,
             aspectName=self.aspectName,
             systemMetadata=self.systemMetadata,
+            headers=self.headers,
         )
 
     def make_mcp(self) -> MetadataChangeProposalClass:
@@ -211,6 +213,7 @@ class MetadataChangeProposalWrapper:
                 aspectName=mcpc.aspectName,
                 aspect=aspect,
                 systemMetadata=mcpc.systemMetadata,
+                headers=mcpc.headers,
             )
         else:
             return None
@@ -228,6 +231,7 @@ class MetadataChangeProposalWrapper:
             changeType=mcl.changeType,
             auditHeader=mcl.auditHeader,
             systemMetadata=mcl.systemMetadata,
+            headers=mcl.headers,
         )
         return cls.try_from_mcpc(mcpc) or mcpc
 
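The net effect of this change is that per-proposal headers set on the wrapper are carried through to the emitted MetadataChangeProposalClass (and recovered when converting back from MCP/MCL classes). A minimal, hedged sketch of how that might be used; the URN and aspect below are invented for illustration:

```python
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import ChangeTypeClass, StatusClass

# Illustrative URN and aspect; the point is the new headers/changeType plumbing.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)",
    aspect=StatusClass(removed=False),
    changeType=ChangeTypeClass.CREATE,
    headers={"If-None-Match": "*"},  # create-only write, as snowflake_tag.py does below
)
mcpc = mcp.make_mcp()  # the emitted proposal should now carry these headers as well
```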
datahub/ingestion/source/mlflow.py CHANGED
@@ -7,6 +7,7 @@ from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Unio
 from mlflow import MlflowClient
 from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
 from mlflow.entities.model_registry import ModelVersion, RegisteredModel
+from mlflow.exceptions import MlflowException
 from mlflow.store.entities import PagedList
 from pydantic.fields import Field
 
@@ -589,8 +590,8 @@ class MLflowSource(StatefulIngestionSourceBase):
         )
         return runs
 
-    @staticmethod
     def _traverse_mlflow_search_func(
+        self,
         search_func: Callable[..., PagedList[T]],
         **kwargs: Any,
     ) -> Iterable[T]:
@@ -598,12 +599,24 @@ class MLflowSource(StatefulIngestionSourceBase):
         Utility to traverse an MLflow search_* functions which return PagedList.
         """
         next_page_token = None
-        while True:
-            paged_list = search_func(page_token=next_page_token, **kwargs)
-            yield from paged_list.to_list()
-            next_page_token = paged_list.token
-            if not next_page_token:
+        try:
+            while True:
+                paged_list = search_func(page_token=next_page_token, **kwargs)
+                yield from paged_list.to_list()
+                next_page_token = paged_list.token
+                if not next_page_token:
+                    return
+        except MlflowException as e:
+            if e.error_code == "ENDPOINT_NOT_FOUND":
+                self.report.warning(
+                    title="MLflow API Endpoint Not Found for Experiments.",
+                    message="Please upgrade to version 1.28.0 or higher to ensure compatibility. Skipping ingestion for experiments and runs.",
+                    context=None,
+                    exc=e,
+                )
                 return
+            else:
+                raise  # Only re-raise other exceptions
 
     def _get_latest_version(self, registered_model: RegisteredModel) -> Optional[str]:
         return (
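Taken out of the source class, the traversal now pages through an MLflow search_* API and degrades gracefully when the endpoint is missing. A rough sketch under those assumptions; the helper name and call sites are illustrative, only the MLflow calls and the ENDPOINT_NOT_FOUND handling mirror the diff above:

```python
from typing import Any, Callable, Iterable

from mlflow import MlflowClient
from mlflow.exceptions import MlflowException


def traverse(search_func: Callable[..., Any], **kwargs: Any) -> Iterable[Any]:
    """Page through an MLflow search_* API; stop quietly if the endpoint is missing."""
    token = None
    try:
        while True:
            page = search_func(page_token=token, **kwargs)
            yield from page.to_list()
            token = page.token
            if not token:
                return
    except MlflowException as e:
        if e.error_code == "ENDPOINT_NOT_FOUND":
            return  # tracking server predates the experiments API (MLflow < 1.28.0)
        raise


client = MlflowClient()
experiments = list(traverse(client.search_experiments))  # silently empty on old servers
runs = list(
    traverse(client.search_runs, experiment_ids=[e.experiment_id for e in experiments])
)
```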
datahub/ingestion/source/powerbi/config.py CHANGED
@@ -192,6 +192,11 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="mysql",
     )
 
+    ODBC = DataPlatformPair(
+        powerbi_data_platform_name="Odbc",
+        datahub_data_platform_name="odbc",
+    )
+
 
 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -341,6 +346,13 @@ class PowerBiDashboardSourceConfig(
         "For Google BigQuery the datasource's server is google bigquery project name. "
         "For Databricks Unity Catalog the datasource's server is workspace FQDN.",
     )
+    # ODBC DSN to platform mapping
+    dsn_to_platform_name: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to DataHub data platform name. "
+        "For example with an ODBC connection string 'DSN=database' where the database type "
+        "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
datahub/ingestion/source/powerbi/m_query/data_classes.py CHANGED
@@ -75,3 +75,4 @@ class FunctionName(Enum):
     AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
     MYSQL_DATA_ACCESS = "MySQL.Database"
+    ODBC_DATA_ACCESS = "Odbc.DataSource"
datahub/ingestion/source/powerbi/m_query/odbc.py ADDED
@@ -0,0 +1,185 @@
+import re
+from typing import Optional, Tuple, Union
+
+server_patterns = [
+    r"Server=([^:]+)[:][0-9]+/.*",
+    r"SERVER=\{([^}]*)\}",
+    r"SERVER=([^;]*)",
+    r"HOST=\{([^}]*)\}",
+    r"HOST=([^;]*)",
+    r"DATA SOURCE=\{([^}]*)\}",
+    r"DATA SOURCE=([^;]*)",
+    r"DSN=\{([^}]*)\}",
+    r"DSN=([^;]*)",
+    r"Server=([^;]*)",
+    r"S3OutputLocation=([^;]*)",
+    r"HTTPPath=([^;]*)",
+    r"Host=([^;]*)",
+]
+
+dsn_patterns = [
+    r"DSN\s*=\s*\"([^\"]+)\"",
+    r"DSN\s*=\s*\'([^\']+)\'",
+    r"DSN\s*=\s*([^;]+)",
+]
+
+platform_patterns = {
+    "mysql": r"mysql",
+    "postgres": r"post(gre(s|sql)?|gres)",
+    "mssql": r"(sql\s*server|mssql|sqlncli)",
+    "oracle": r"oracle",
+    "db2": r"db2",
+    "sqlite": r"sqlite",
+    "access": r"(access|\.mdb|\.accdb)",
+    "excel": r"(excel|\.xls)",
+    "firebird": r"firebird",
+    "informix": r"informix",
+    "sybase": r"sybase",
+    "teradata": r"teradata",
+    "hadoop": r"(hadoop|hive)",
+    "snowflake": r"snowflake",
+    "redshift": r"redshift",
+    "bigquery": r"bigquery",
+    "athena": r"(athena|aws\s*athena)",
+    "databricks": r"(databricks|spark)",
+}
+
+powerbi_platform_names = {
+    "mysql": "MySQL",
+    "postgres": "PostgreSQL",
+    "mssql": "SQL Server",
+    "oracle": "Oracle",
+    "db2": "IBM DB2",
+    "sqlite": "SQLite",
+    "access": "Microsoft Access",
+    "excel": "Microsoft Excel",
+    "firebird": "Firebird",
+    "informix": "IBM Informix",
+    "sybase": "SAP Sybase",
+    "teradata": "Teradata",
+    "hadoop": "Hadoop",
+    "snowflake": "Snowflake",
+    "redshift": "Amazon Redshift",
+    "bigquery": "Google BigQuery",
+    "athena": "Amazon Athena",
+    "databricks": "Databricks",
+}
+
+
+def extract_driver(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the driver name.
+    Handles whitespace in driver names and various connection string formats.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted driver name, or None if not found
+    """
+    # Match DRIVER={driver name} pattern
+    driver_match = re.search(r"DRIVER=\{([^}]*)}", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    # Alternative pattern for DRIVER=driver
+    driver_match = re.search(r"DRIVER=([^;]*)", connection_string, re.IGNORECASE)
+
+    if driver_match:
+        return driver_match.group(1).strip()
+
+    return None
+
+
+def extract_dsn(connection_string: str) -> Union[str, None]:
+    """
+    Extract the DSN value from an ODBC connection string.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str or None: The extracted DSN value, or None if not found
+    """
+    for pattern in dsn_patterns:
+        match = re.search(pattern, connection_string, re.IGNORECASE)
+        if match:
+            return match.group(1).strip()
+
+    return None
+
+
+def extract_server(connection_string: str) -> Union[str, None]:
+    """
+    Parse an ODBC connection string and extract the server name.
+    Handles various parameter names for server (SERVER, Host, Data Source, etc.)
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        str: The extracted server name, or None if not found
+    """
+    for pattern in server_patterns:
+        server_match = re.search(pattern, connection_string, re.IGNORECASE)
+        if server_match:
+            return server_match.group(1).strip()
+
+    # Special case for Athena: extract from AwsRegion if no server found
+    region_match = re.search(r"AwsRegion=([^;]*)", connection_string, re.IGNORECASE)
+    if region_match:
+        return f"aws-athena-{region_match.group(1).strip()}"
+
+    # Special case for Databricks: try to extract hostname from JDBC URL
+    jdbc_match = re.search(r"jdbc:spark://([^:;/]+)", connection_string, re.IGNORECASE)
+    if jdbc_match:
+        return jdbc_match.group(1).strip()
+
+    return None
+
+
+def extract_platform(connection_string: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Extract the database platform name from the ODBC driver name.
+    Returns the lowercase platform name.
+
+    Args:
+        connection_string (str): The ODBC connection string
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+            Power BI platform name, or None if not recognized.
+    """
+    driver_name = extract_driver(connection_string)
+    if not driver_name:
+        return None, None
+
+    driver_lower = driver_name.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, driver_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
+
+
+def normalize_platform_name(platform: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Normalizes the platform name by matching it with predefined patterns and maps it to
+    a corresponding Power BI platform name.
+
+    Args:
+        platform (str): The platform name to normalize
+
+    Returns:
+        tuple: A tuple containing the normalized platform name and the corresponding
+            Power BI platform name, or None if not recognized.
+    """
+    platform_lower = platform.lower()
+
+    for platform, pattern in platform_patterns.items():
+        if re.search(pattern, platform_lower):
+            return platform, powerbi_platform_names.get(platform)
+
+    return None, None
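A quick sketch of what these helpers return for made-up connection strings (outputs follow from the patterns above):

```python
from datahub.ingestion.source.powerbi.m_query import odbc

conn = "DRIVER={PostgreSQL Unicode};SERVER=db.internal;PORT=5432;DATABASE=sales"
odbc.extract_driver(conn)    # -> "PostgreSQL Unicode"
odbc.extract_server(conn)    # -> "db.internal"
odbc.extract_platform(conn)  # -> ("postgres", "PostgreSQL")

odbc.extract_dsn("DSN=warehouse;UID=user;PWD=secret")  # -> "warehouse"
odbc.extract_platform("DSN=warehouse")                 # -> (None, None): no DRIVER to classify
```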
datahub/ingestion/source/powerbi/m_query/pattern_handler.py CHANGED
@@ -29,6 +29,12 @@ from datahub.ingestion.source.powerbi.m_query.data_classes import (
     Lineage,
     ReferencedTable,
 )
+from datahub.ingestion.source.powerbi.m_query.odbc import (
+    extract_dsn,
+    extract_platform,
+    extract_server,
+    normalize_platform_name,
+)
 from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table
 from datahub.metadata.schema_classes import SchemaFieldDataTypeClass
 from datahub.sql_parsing.sqlglot_lineage import (
@@ -155,6 +161,7 @@ class AbstractLineage(ABC):
                 tree_function.token_values(arg_list)
             ),
         )
+        logger.debug(f"DB Details: {arguments}")
 
         if len(arguments) < 2:
             logger.debug(f"Expected minimum 2 arguments, but got {len(arguments)}")
@@ -940,6 +947,147 @@ class NativeQueryLineage(AbstractLineage):
         )
 
 
+class OdbcLineage(AbstractLineage):
+    def create_lineage(
+        self, data_access_func_detail: DataAccessFunctionDetail
+    ) -> Lineage:
+        logger.debug(
+            f"Processing {self.get_platform_pair().powerbi_data_platform_name} "
+            f"data-access function detail {data_access_func_detail}"
+        )
+
+        connect_string, _ = self.get_db_detail_from_argument(
+            data_access_func_detail.arg_list
+        )
+
+        if not connect_string:
+            self.reporter.warning(
+                title="Can not extract ODBC connect string",
+                message="Can not extract ODBC connect string from data access function. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-access-func-detail={data_access_func_detail}",
+            )
+            return Lineage.empty()
+
+        logger.debug(f"ODBC connect string: {connect_string}")
+        data_platform, powerbi_platform = extract_platform(connect_string)
+        server_name = extract_server(connect_string)
+
+        if not data_platform:
+            dsn = extract_dsn(connect_string)
+            if dsn:
+                logger.debug(f"Extracted DSN: {dsn}")
+                server_name = dsn
+            if dsn and self.config.dsn_to_platform_name:
+                logger.debug(f"Attempting to map DSN {dsn} to platform")
+                name = self.config.dsn_to_platform_name.get(dsn)
+                if name:
+                    logger.debug(f"Found DSN {dsn} mapped to platform {name}")
+                    data_platform, powerbi_platform = normalize_platform_name(name)
+
+        if not data_platform or not powerbi_platform:
+            self.reporter.warning(
+                title="Can not determine ODBC platform",
+                message="Can not determine platform from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+
+        platform_pair: DataPlatformPair = self.create_platform_pair(
+            data_platform, powerbi_platform
+        )
+
+        if not server_name and self.config.server_to_platform_instance:
+            self.reporter.warning(
+                title="Can not determine ODBC server name",
+                message="Can not determine server name with server_to_platform_instance mapping. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+        elif not server_name:
+            server_name = "unknown"
+
+        database_name = None
+        schema_name = None
+        table_name = None
+        qualified_table_name = None
+
+        temp_accessor: Optional[IdentifierAccessor] = (
+            data_access_func_detail.identifier_accessor
+        )
+
+        while temp_accessor:
+            logger.debug(
+                f"identifier = {temp_accessor.identifier} items = {temp_accessor.items}"
+            )
+            if temp_accessor.items.get("Kind") == "Database":
+                database_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Schema":
+                schema_name = temp_accessor.items["Name"]
+
+            if temp_accessor.items.get("Kind") == "Table":
+                table_name = temp_accessor.items["Name"]
+
+            if temp_accessor.next is not None:
+                temp_accessor = temp_accessor.next
+            else:
+                break
+
+        if (
+            database_name is not None
+            and schema_name is not None
+            and table_name is not None
+        ):
+            qualified_table_name = f"{database_name}.{schema_name}.{table_name}"
+        elif database_name is not None and table_name is not None:
+            qualified_table_name = f"{database_name}.{table_name}"
+
+        if not qualified_table_name:
+            self.reporter.warning(
+                title="Can not determine qualified table name",
+                message="Can not determine qualified table name for ODBC data source. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, data-platform={data_platform}",
+            )
+            logger.warning(
+                f"Can not determine qualified table name for ODBC data source {data_platform} "
+                f"table {self.table.full_name}."
+            )
+            return Lineage.empty()
+
+        logger.debug(
+            f"ODBC Platform {data_platform} found qualified table name {qualified_table_name}"
+        )
+
+        urn = make_urn(
+            config=self.config,
+            platform_instance_resolver=self.platform_instance_resolver,
+            data_platform_pair=platform_pair,
+            server=server_name,
+            qualified_table_name=qualified_table_name,
+        )
+
+        column_lineage = self.create_table_column_lineage(urn)
+
+        return Lineage(
+            upstreams=[
+                DataPlatformTable(
+                    data_platform_pair=platform_pair,
+                    urn=urn,
+                )
+            ],
+            column_lineage=column_lineage,
+        )
+
+    @staticmethod
+    def create_platform_pair(
+        data_platform: str, powerbi_platform: str
+    ) -> DataPlatformPair:
+        return DataPlatformPair(data_platform, powerbi_platform)
+
+    def get_platform_pair(self) -> DataPlatformPair:
+        return SupportedDataPlatform.ODBC.value
+
+
 class SupportedPattern(Enum):
     DATABRICKS_QUERY = (
         DatabricksLineage,
@@ -991,6 +1139,11 @@ class SupportedPattern(Enum):
         FunctionName.NATIVE_QUERY,
     )
 
+    ODBC = (
+        OdbcLineage,
+        FunctionName.ODBC_DATA_ACCESS,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]
 
datahub/ingestion/source/snowflake/snowflake_config.py CHANGED
@@ -301,6 +301,7 @@ class SnowflakeV2Config(
         default=AllowDenyPattern.allow_all(),
         description=(
            "List of regex patterns for structured properties to include in ingestion."
+            " Applied to tags with form `<database>.<schema>.<tag_name>`."
            " Only used if `extract_tags` and `extract_tags_as_structured_properties` are enabled."
         ),
     )
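The pattern itself is a regular AllowDenyPattern, so with this change its regexes are matched against the fully qualified `<database>.<schema>.<tag_name>` identifier. A hedged illustration with invented names:

```python
from datahub.configuration.common import AllowDenyPattern

pattern = AllowDenyPattern(allow=[r"analytics\.governance\..*"])
pattern.allowed("analytics.governance.pii_level")  # True  -> template would be emitted
pattern.allowed("raw_db.public.pii_level")         # False -> skipped
```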
datahub/ingestion/source/snowflake/snowflake_tag.py CHANGED
@@ -23,6 +23,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 from datahub.metadata.com.linkedin.pegasus2avro.structured import (
     StructuredPropertyDefinition,
 )
+from datahub.metadata.schema_classes import ChangeTypeClass
 from datahub.metadata.urns import (
     ContainerUrn,
     DatasetUrn,
@@ -81,7 +82,7 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
     def create_structured_property_templates(self) -> Iterable[MetadataWorkUnit]:
         for tag in self.data_dictionary.get_all_tags():
             if not self.config.structured_property_pattern.allowed(
-                tag.tag_identifier()
+                tag._id_prefix_as_str()
             ):
                 continue
             if self.config.extract_tags_as_structured_properties:
@@ -111,6 +112,8 @@ class SnowflakeTagExtractor(SnowflakeCommonMixin):
         yield MetadataChangeProposalWrapper(
             entityUrn=urn,
             aspect=aspect,
+            changeType=ChangeTypeClass.CREATE,
+            headers={"If-None-Match": "*"},
         ).as_workunit()
 
     def _get_tags_on_object_with_propagation(
datahub/testing/mcp_diff.py CHANGED
@@ -2,7 +2,7 @@ import dataclasses
 import json
 import re
 from collections import defaultdict
-from typing import Any, Dict, List, Sequence, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
 
 import deepdiff.serialization
 import yaml
@@ -34,6 +34,7 @@ class AspectForDiff:
     aspect_name: str
     aspect: Dict[str, Any] = dataclasses.field(hash=False)
     delta_info: "DeltaInfo" = dataclasses.field(hash=False, repr=False)
+    headers: Optional[Dict[str, str]] = dataclasses.field(default=None, hash=False)
 
     @classmethod
     def create_from_mcp(cls, idx: int, obj: Dict[str, Any]) -> "AspectForDiff":
@@ -44,6 +45,7 @@ class AspectForDiff:
             aspect_name=obj["aspectName"],
             aspect=aspect.get("json", aspect),
             delta_info=DeltaInfo(idx=idx, original=obj),
+            headers=obj.get("headers"),
         )
 
     def __repr__(self):
@@ -240,9 +242,12 @@ class MCPDiff:
                 s.append(serialize_aspect(ga.aspect))
             for (i, old, new), diffs in aspect_diffs.aspects_changed.items():
                 s.append(self.report_aspect(old, i, "changed") + ":")
+
+                print_aspects = False
                 for diff_level in diffs:
                     s.append(self.report_diff_level(diff_level, i))
-                if verbose:
+                    print_aspects |= self.is_diff_level_on_aspect(diff_level)
+                if verbose and print_aspects:
                     s.append(f"Old aspect:\n{serialize_aspect(old.aspect)}")
                     s.append(f"New aspect:\n{serialize_aspect(new.aspect)}")
 
@@ -271,6 +276,14 @@ class MCPDiff:
             f"root[{idx}].", ""
         )
 
+    @staticmethod
+    def is_diff_level_on_aspect(diff: DiffLevel) -> bool:
+        skip_print_fields = ["changeType", "headers"]
+        try:
+            return diff.path(output_format="list")[1] not in skip_print_fields
+        except IndexError:
+            return True
+
 
 def serialize_aspect(aspect: Union[AspectForDiff, Dict[str, Any]]) -> str:
     if isinstance(aspect, AspectForDiff):  # Unpack aspect
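The new helper inspects the deepdiff path of each change: when the second path element is `changeType` or `headers`, the change is treated as metadata-only and the full old/new aspects are not printed. A small, hedged illustration of what those path lists look like (data and output invented):

```python
from deepdiff import DeepDiff

old = {0: {"changeType": "UPSERT", "aspect": {"json": {"removed": False}}}}
new = {0: {"changeType": "CREATE", "aspect": {"json": {"removed": True}}}}

for level in DeepDiff(old, new, view="tree")["values_changed"]:
    path = level.path(output_format="list")
    # e.g. [0, 'changeType'] is metadata-only; [0, 'aspect', 'json', 'removed'] is an aspect change
    print(path, path[1] not in ("changeType", "headers"))
```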