acryl-datahub 1.3.0rc4__py3-none-any.whl → 1.3.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

acryl_datahub-1.3.0rc4.dist-info/RECORD → acryl_datahub-1.3.0.1rc1.dist-info/RECORD RENAMED

@@ -1,7 +1,7 @@
- acryl_datahub-1.3.0rc4.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.3.0.1rc1.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=IE1A1EIFh0zNNTxCNZuHp84ueLDs5dQeRmsZJgB-YLw,321
+ datahub/_version.py,sha256=R7AO3BvsCQaRcQe_NV5EtywuCZj3jLQL4SqalUnrmTc,323
  datahub/entrypoints.py,sha256=9Qf-37rNnTzbGlx8S75OCDazIclFp6zWNcCEL1zCZto,9015
  datahub/errors.py,sha256=p5rFAdAGVCk4Lqolol1YvthceadUSwpaCxLXRcyCCFQ,676
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -100,7 +100,7 @@ datahub/cli/specific/structuredproperties_cli.py,sha256=rgx8fhI7WYq8QLwIttkMysX7
  datahub/cli/specific/user_cli.py,sha256=HaASGg25b82Q18hKyOn98gPoQfmj1gW0utDMs0iR5WY,1897
  datahub/configuration/__init__.py,sha256=5TN3a7CWNsLRHpdj-sv2bxKWF2IslvJwE6EpNMFrIS4,123
  datahub/configuration/_config_enum.py,sha256=ul2hr5gMmdLvBINicFkMNMi1ApmnmZSwNdUYYted5nk,1447
- datahub/configuration/common.py,sha256=HvMrc-q2ZocbAHj5KgBHUaQOjdWLvk_wRGDkTUVbhtU,11742
+ datahub/configuration/common.py,sha256=i34Ec4tkTc2iy0ay82ZqPUeapUR3-ZKItR3gzI-0zTg,11730
  datahub/configuration/config_loader.py,sha256=hRzPFxkz-w9IqkpSa5vwCzSra1p49DyfeJNeyqGa8-4,6827
  datahub/configuration/connection_resolver.py,sha256=UsnV1_X8yivOykiifllkoKRn19eO6j_NTBWHC2Ob5Xg,1625
  datahub/configuration/datetimes.py,sha256=nayNc0mmlVKH6oVv9ud6C1dDUiZPGabW-YZxvrkosPg,2870
@@ -334,7 +334,7 @@ datahub/ingestion/source/excel/report.py,sha256=oEkeI8J6is7zB9iz4RqASu_-Q5xl36lA
  datahub/ingestion/source/excel/source.py,sha256=w_vOz4UD7BcXBBDKoo81_6-QFeOPITuXqkfjIMHCQj4,23827
  datahub/ingestion/source/excel/util.py,sha256=YYmadYuCiT-4_MfQM0YSE7wuDcE0k8o2KrlOKM9Z6eI,406
  datahub/ingestion/source/fivetran/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source/fivetran/config.py,sha256=vNmnQM3oekr2dOLPria-wjCLmp27bcYypIfoA6xx5k8,10290
+ datahub/ingestion/source/fivetran/config.py,sha256=6yriUMtTPMZUHqbZ9gzyFduPVt6CxzirdYSg4k-ziYI,10285
  datahub/ingestion/source/fivetran/data_classes.py,sha256=ecdUJH5BEze0yv-uFpKWPNaNmV1gORDA2XMFk0zhcBw,595
  datahub/ingestion/source/fivetran/fivetran.py,sha256=Up5wbLk7hBk9b0pqcHwW6b0H52UJj90cmLhn0QJeZ4g,14416
  datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=-ibtfgxFv08P5_X5PVqV4CocxAjRWmY858esQL5OaAQ,13697
@@ -568,7 +568,8 @@ datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=wsVD0SkGUwb-H9
  datahub/ingestion/source/tableau/tableau_validation.py,sha256=Hjbfc1AMIkGgzo5ffWXtNRjrxSxzHvw7-dYZDt4d3WE,1819
  datahub/ingestion/source/unity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/unity/analyze_profiler.py,sha256=2pqkFY30CfN4aHgFZZntjeG0hNhBytZJvXC13VfTc1I,4689
- datahub/ingestion/source/unity/config.py,sha256=A5lkm-koBDOnBSSCTzOvYlsSIxT-xbK3NcJMS6xJMaQ,20914
+ datahub/ingestion/source/unity/config.py,sha256=lHvr-PGVcZ0P_2e0RuwmfSRlQRJ81astx4hQZkNrX_k,18713
+ datahub/ingestion/source/unity/connection.py,sha256=iCsQhZ1vxzv1qQKTl_sFUZdmBLLIrNdu2X2V8hT7IGI,2441
  datahub/ingestion/source/unity/connection_test.py,sha256=Dwpz4AIc6ZDwq6pWmRCSCuDUgNjPP_bVAVJumgAAS4w,2661
  datahub/ingestion/source/unity/ge_profiler.py,sha256=NBRHZceq-f95iUn7u0h7cgcd9nAc48Aa-lmp_BqE0As,8409
  datahub/ingestion/source/unity/hive_metastore_proxy.py,sha256=IAWWJjaW0si_UF52Se2D7wmdYRY_afUG4QlVmQu6xaw,15351
@@ -594,7 +595,7 @@ datahub/ingestion/source_config/csv_enricher.py,sha256=IROxxfFJA56dHkmmbjjhb7h1p
  datahub/ingestion/source_config/operation_config.py,sha256=hxF2RM0jk0HUPXYiliMniXBC-wz-ZPcs90ZGLfHT8rE,3924
  datahub/ingestion/source_config/pulsar.py,sha256=zi3QTAw8CzzuwXgU-GUCuLyneT5pxHsLqZFyd15ECYs,5604
  datahub/ingestion/source_report/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datahub/ingestion/source_report/ingestion_stage.py,sha256=0MY39QetRovYd1iBNSy0OW11YyaOsPaqhQi-1svmAcY,3106
+ datahub/ingestion/source_report/ingestion_stage.py,sha256=7r_WUQ3mHH8nAlVu7QUZcj8CwTRORXRhAaATTgkVZ8c,3111
  datahub/ingestion/source_report/pulsar.py,sha256=f6CMNw8TyPp3tuSGsLLPEhSvoQLXwxtaaM6GmNvsANU,1119
  datahub/ingestion/source_report/time_window.py,sha256=9yI5l2S1DcF7ClvUHLeN8m62I5vlhV9k-aQqSZh2l7w,229
  datahub/ingestion/transformer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -1018,7 +1019,7 @@ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=kxxSVe3YNoz_T2OG6-F30ZuXNSX
  datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
  datahub/sql_parsing/sqlglot_lineage.py,sha256=Zli78TtN8ow-uqNl_oloWT642a5bDGn22-FF0O0tqrs,66658
- datahub/sql_parsing/sqlglot_utils.py,sha256=zH8V9tAcSVO7Y8I3sIKPhs0D_9HzdNBlranBDmk1NB4,15454
+ datahub/sql_parsing/sqlglot_utils.py,sha256=2CwrnDsmjcDlwjdBbmaiTVbWZjEAm3fr4ulEvTr3cZQ,15343
  datahub/sql_parsing/tool_meta_extractor.py,sha256=5JsLPcKjuXSrPGxNIhRvX72dFPmlV33-hyvhJwlWxCY,7543
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/telemetry/stats.py,sha256=TwaQisQlD2Bk0uw__pP6u3Ovz9r-Ip4pCwpnto4r5e0,959
@@ -1125,8 +1126,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.3.0rc4.dist-info/METADATA,sha256=nsoIeC_TnJAbmM1wNUIO_dpEiFFZR2xr1AiwJzuJnqk,184332
- acryl_datahub-1.3.0rc4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- acryl_datahub-1.3.0rc4.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
- acryl_datahub-1.3.0rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.3.0rc4.dist-info/RECORD,,
+ acryl_datahub-1.3.0.1rc1.dist-info/METADATA,sha256=7XyrCvdsnuEzzHSrBD4-gyr723Kq-Nj2Jawvh5sOz1M,184432
+ acryl_datahub-1.3.0.1rc1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ acryl_datahub-1.3.0.1rc1.dist-info/entry_points.txt,sha256=pzsBoTx-D-iTcmpX8oCGCyzlHP2112EygUMzZWz56M8,10105
+ acryl_datahub-1.3.0.1rc1.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.3.0.1rc1.dist-info/RECORD,,
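Each RECORD line is a `path,hash,size` triple; the hash is the PEP 376/427-style digest, i.e. urlsafe base64 of the raw sha256 bytes with the `=` padding stripped. A minimal sketch for checking an entry against a locally extracted wheel (the path is illustrative):

```python
import base64
import hashlib

def record_hash(path: str) -> str:
    # RECORD format: urlsafe base64 of the raw sha256 digest, '=' padding removed.
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# For the new wheel this should print
# sha256=R7AO3BvsCQaRcQe_NV5EtywuCZj3jLQL4SqalUnrmTc
print(record_hash("datahub/_version.py"))
```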
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.3.0rc4"
+ __version__ = "1.3.0.1rc1"


  def is_dev_mode() -> bool:
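The bump to a four-component version still parses and orders as expected under PEP 440; a quick sanity check using the `packaging` library (the standard version parser, not a datahub API):

```python
from packaging.version import Version

# 1.3.0.1rc1 is a pre-release of the four-component 1.3.0.1, so it sorts
# after every 1.3.0 release candidate and after 1.3.0 itself.
assert Version("1.3.0.1rc1") > Version("1.3.0rc4")
assert Version("1.3.0.1rc1") > Version("1.3.0")
assert Version("1.3.0.1rc1").is_prerelease
```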
datahub/configuration/common.py CHANGED

@@ -173,11 +173,11 @@ class ConnectionModel(BaseModel):
      """Represents the config associated with a connection"""

      class Config:
-         if PYDANTIC_VERSION_2:  # noqa: SIM108
+         if PYDANTIC_VERSION_2:
              extra = "allow"
          else:
              extra = Extra.allow
-         underscore_attrs_are_private = True
+             underscore_attrs_are_private = True


  class TransformerSemantics(ConfigEnum):
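The indentation change is the substantive part: `underscore_attrs_are_private` is a pydantic v1-only Config option, so it now applies only in the v1 branch instead of unconditionally. A minimal sketch of the pattern, assuming `PYDANTIC_VERSION_2` is derived from the installed pydantic version (the model name is made up):

```python
import pydantic

PYDANTIC_VERSION_2 = pydantic.VERSION.startswith("2.")

class ConnectionModelSketch(pydantic.BaseModel):
    class Config:
        if PYDANTIC_VERSION_2:
            extra = "allow"
        else:
            extra = "allow"  # pydantic v1 also accepts the Extra.allow enum here
            # v1-only knob that makes _attrs private; it has no meaning in v2,
            # which is why it belongs inside the else branch.
            underscore_attrs_are_private = True
```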
datahub/ingestion/source/fivetran/config.py CHANGED

@@ -29,9 +29,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
  from datahub.ingestion.source.state.stateful_ingestion_base import (
      StatefulIngestionConfigBase,
  )
- from datahub.ingestion.source.unity.config import (
-     UnityCatalogConnectionConfig,
- )
+ from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig
  from datahub.utilities.lossy_collections import LossyList
  from datahub.utilities.perf_timer import PerfTimer

datahub/ingestion/source/unity/config.py CHANGED

@@ -2,7 +2,6 @@ import logging
  import os
  from datetime import datetime, timedelta, timezone
  from typing import Any, Dict, List, Optional, Union
- from urllib.parse import urlparse

  import pydantic
  from pydantic import Field
@@ -20,10 +19,8 @@ from datahub.configuration.source_common import (
  )
  from datahub.configuration.validate_field_removal import pydantic_removed_field
  from datahub.configuration.validate_field_rename import pydantic_renamed_field
- from datahub.ingestion.source.ge_data_profiler import DATABRICKS
  from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig
  from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
- from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StatefulStaleMetadataRemovalConfig,
  )
@@ -31,6 +28,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
      StatefulIngestionConfigBase,
      StatefulProfilingConfigMixin,
  )
+ from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig
  from datahub.ingestion.source.usage.usage_common import BaseUsageConfig
  from datahub.ingestion.source_config.operation_config import (
      OperationConfig,
@@ -132,55 +130,6 @@ class UnityCatalogGEProfilerConfig(UnityCatalogProfilerConfig, GEProfilingConfig
  )


- class UnityCatalogConnectionConfig(ConfigModel):
-     """
-     Configuration for connecting to Databricks Unity Catalog.
-     Contains only connection-related fields that can be reused across different sources.
-     """
-
-     scheme: str = DATABRICKS
-     token: str = pydantic.Field(description="Databricks personal access token")
-     workspace_url: str = pydantic.Field(
-         description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
-     )
-     warehouse_id: Optional[str] = pydantic.Field(
-         default=None,
-         description=(
-             "SQL Warehouse id, for running queries. Must be explicitly provided to enable SQL-based features. "
-             "Required for the following features that need SQL access: "
-             "1) Tag extraction (include_tags=True) - queries system.information_schema.tags "
-             "2) Hive Metastore catalog (include_hive_metastore=True) - queries legacy hive_metastore catalog "
-             "3) System table lineage (lineage_data_source=SYSTEM_TABLES) - queries system.access.table_lineage/column_lineage "
-             "4) Data profiling (profiling.enabled=True) - runs SELECT/ANALYZE queries on tables. "
-             "When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
-         ),
-     )
-
-     extra_client_options: Dict[str, Any] = Field(
-         default={},
-         description="Additional options to pass to Databricks SQLAlchemy client.",
-     )
-
-     def __init__(self, **data: Any):
-         super().__init__(**data)
-
-     def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
-         uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
-         if database:
-             uri_opts["catalog"] = database
-         return make_sqlalchemy_uri(
-             scheme=self.scheme,
-             username="token",
-             password=self.token,
-             at=urlparse(self.workspace_url).netloc,
-             db=database,
-             uri_opts=uri_opts,
-         )
-
-     def get_options(self) -> dict:
-         return self.extra_client_options
-
-
  class UnityCatalogSourceConfig(
      UnityCatalogConnectionConfig,
      SQLCommonConfig,
datahub/ingestion/source/unity/connection.py ADDED

@@ -0,0 +1,61 @@
+ """Databricks Unity Catalog connection configuration."""
+
+ from typing import Any, Dict, Optional
+ from urllib.parse import urlparse
+
+ import pydantic
+ from pydantic import Field
+
+ from datahub.configuration.common import ConfigModel
+ from datahub.ingestion.source.sql.sqlalchemy_uri import make_sqlalchemy_uri
+
+ DATABRICKS = "databricks"
+
+
+ class UnityCatalogConnectionConfig(ConfigModel):
+     """
+     Configuration for connecting to Databricks Unity Catalog.
+     Contains only connection-related fields that can be reused across different sources.
+     """
+
+     scheme: str = DATABRICKS
+     token: str = pydantic.Field(description="Databricks personal access token")
+     workspace_url: str = pydantic.Field(
+         description="Databricks workspace url. e.g. https://my-workspace.cloud.databricks.com"
+     )
+     warehouse_id: Optional[str] = pydantic.Field(
+         default=None,
+         description=(
+             "SQL Warehouse id, for running queries. Must be explicitly provided to enable SQL-based features. "
+             "Required for the following features that need SQL access: "
+             "1) Tag extraction (include_tags=True) - queries system.information_schema.tags "
+             "2) Hive Metastore catalog (include_hive_metastore=True) - queries legacy hive_metastore catalog "
+             "3) System table lineage (lineage_data_source=SYSTEM_TABLES) - queries system.access.table_lineage/column_lineage "
+             "4) Data profiling (profiling.enabled=True) - runs SELECT/ANALYZE queries on tables. "
+             "When warehouse_id is missing, these features will be automatically disabled (with warnings) to allow ingestion to continue."
+         ),
+     )
+
+     extra_client_options: Dict[str, Any] = Field(
+         default={},
+         description="Additional options to pass to Databricks SQLAlchemy client.",
+     )
+
+     def __init__(self, **data: Any):
+         super().__init__(**data)
+
+     def get_sql_alchemy_url(self, database: Optional[str] = None) -> str:
+         uri_opts = {"http_path": f"/sql/1.0/warehouses/{self.warehouse_id}"}
+         if database:
+             uri_opts["catalog"] = database
+         return make_sqlalchemy_uri(
+             scheme=self.scheme,
+             username="token",
+             password=self.token,
+             at=urlparse(self.workspace_url).netloc,
+             db=database,
+             uri_opts=uri_opts,
+         )
+
+     def get_options(self) -> dict:
+         return self.extra_client_options
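For context, a hedged usage sketch of the relocated config: the token and warehouse id below are placeholders, and the printed URI shape is an expectation based on how `make_sqlalchemy_uri` assembles its arguments, not verified output:

```python
from datahub.ingestion.source.unity.connection import UnityCatalogConnectionConfig

conn = UnityCatalogConnectionConfig(
    token="dapi-placeholder",  # fake personal access token
    workspace_url="https://my-workspace.cloud.databricks.com",
    warehouse_id="abc123",
)
# Expected to build a SQLAlchemy URI roughly of the form:
#   databricks://token:dapi-placeholder@my-workspace.cloud.databricks.com
#     ?http_path=/sql/1.0/warehouses/abc123
print(conn.get_sql_alchemy_url())
```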
datahub/ingestion/source_report/ingestion_stage.py CHANGED

@@ -4,7 +4,6 @@ from contextlib import AbstractContextManager
  from dataclasses import dataclass, field
  from datetime import datetime, timezone
  from enum import Enum
- from typing import Tuple

  from datahub.utilities.perf_timer import PerfTimer
  from datahub.utilities.stats_collections import TopKDict
@@ -38,9 +37,7 @@ class IngestionStageReport:
      ingestion_high_stage_seconds: dict[IngestionHighStage, float] = field(
          default_factory=lambda: defaultdict(float)
      )
-     ingestion_stage_durations: TopKDict[Tuple[IngestionHighStage, str], float] = field(
-         default_factory=TopKDict
-     )
+     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)

      def new_stage(
          self, stage: str, high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED
@@ -81,9 +78,9 @@ class IngestionStageContext(AbstractContextManager):
              f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
              stacklevel=2,
          )
-         self._report.ingestion_stage_durations[
-             (self._high_stage, self._ingestion_stage)
-         ] = elapsed
+         # Store tuple as string to avoid serialization errors
+         key = f"({self._high_stage.value}, {self._ingestion_stage})"
+         self._report.ingestion_stage_durations[key] = elapsed
      else:
          logger.info(
              f"Time spent in stage <{self._high_stage.value}>: {elapsed} seconds",
datahub/sql_parsing/sqlglot_utils.py CHANGED

@@ -40,9 +40,6 @@ def _get_dialect_str(platform: str) -> str:
      # let the fuzzy resolution logic handle it.
      # MariaDB is a fork of MySQL, so we reuse the same dialect.
      return "mysql, normalization_strategy = lowercase"
-     # Dremio is based upon drill. Not 100% compatibility
-     elif platform == "dremio":
-         return "drill"
  else:
      return platform
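The surviving `"mysql, normalization_strategy = lowercase"` return value is a sqlglot dialect string with inline settings, and the `dremio` → `drill` alias was dropped, presumably because recent sqlglot versions resolve `dremio` on their own. A small sketch of how such a settings string behaves (the query is made up):

```python
import sqlglot
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers

# The part after the comma is parsed as a dialect setting, so unquoted
# identifiers get normalized to lowercase when identifier normalization runs.
dialect = "mysql, normalization_strategy = lowercase"
tree = sqlglot.parse_one("SELECT A FROM B", dialect=dialect)
print(normalize_identifiers(tree, dialect=dialect).sql(dialect=dialect))
# expected: SELECT a FROM b
```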