databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/analyzer/__init__.py +0 -0
- databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
- databricks/labs/lakebridge/base_install.py +24 -3
- databricks/labs/lakebridge/cli.py +19 -53
- databricks/labs/lakebridge/contexts/application.py +7 -0
- databricks/labs/lakebridge/deployment/job.py +2 -2
- databricks/labs/lakebridge/helpers/file_utils.py +36 -0
- databricks/labs/lakebridge/install.py +187 -157
- databricks/labs/lakebridge/reconcile/compare.py +70 -33
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
- databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
- databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
- databricks/labs/lakebridge/reconcile/constants.py +4 -3
- databricks/labs/lakebridge/reconcile/execute.py +9 -810
- databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
- databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
- databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
- databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
- databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
- databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
- databricks/labs/lakebridge/reconcile/utils.py +38 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +45 -60
- databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
- databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +37 -28
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/install.py

```diff
@@ -1,40 +1,41 @@
 import re
 import abc
 import dataclasses
-import shutil
-from json import loads, dump
 import logging
 import os
-
-from subprocess import run, CalledProcessError
+import shutil
 import sys
-
-from urllib import request
-from urllib.error import URLError, HTTPError
+import venv
 import webbrowser
+import xml.etree.ElementTree as ET
 from datetime import datetime, timezone
+from json import loads, dump
 from pathlib import Path
-import
+from shutil import rmtree, move
+from subprocess import run, CalledProcessError
+from typing import Any, Literal, cast
+from urllib import request
+from urllib.error import URLError, HTTPError
 from zipfile import ZipFile
 
-from databricks.labs.blueprint.installation import Installation, JsonValue
-from databricks.labs.blueprint.installation import SerdeError
+from databricks.labs.blueprint.installation import Installation, JsonValue, SerdeError
 from databricks.labs.blueprint.installer import InstallState
 from databricks.labs.blueprint.tui import Prompts
 from databricks.labs.blueprint.wheels import ProductInfo
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.errors import NotFound, PermissionDenied
 
+from databricks.labs.lakebridge.__about__ import __version__
 from databricks.labs.lakebridge.config import (
-    TranspileConfig,
-    ReconcileConfig,
     DatabaseConfig,
+    ReconcileConfig,
     LakebridgeConfiguration,
     ReconcileMetadataConfig,
+    TranspileConfig,
 )
+from databricks.labs.lakebridge.contexts.application import ApplicationContext
 from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator
 from databricks.labs.lakebridge.deployment.installation import WorkspaceInstallation
-from databricks.labs.lakebridge.helpers.file_utils import chdir
 from databricks.labs.lakebridge.reconcile.constants import ReconReportType, ReconSourceType
 from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository
 
```
```diff
@@ -43,9 +44,70 @@ logger = logging.getLogger(__name__)
 TRANSPILER_WAREHOUSE_PREFIX = "Lakebridge Transpiler Validation"
 
 
+class _PathBackup:
+    """A context manager to preserve a path before performing an operation, and optionally restore it afterwards."""
+
+    def __init__(self, path: Path) -> None:
+        self._path = path
+        self._backup_path: Path | None = None
+        self._finished = False
+
+    def __enter__(self) -> "_PathBackup":
+        self.start()
+        return self
+
+    def start(self) -> None:
+        """Start the backup process by creating a backup of the path, if it already exists."""
+        backup_path = self._path.with_name(f"{self._path.name}-saved")
+        if backup_path.exists():
+            logger.debug(f"Existing backup found, removing: {backup_path}")
+            rmtree(backup_path)
+        if self._path.exists():
+            logger.debug(f"Backing up existing path: {self._path} -> {backup_path}")
+            os.rename(self._path, backup_path)
+            self._backup_path = backup_path
+        else:
+            self._backup_path = None
+
+    def rollback(self) -> None:
+        """Rollback the operation by restoring the backup path, if it exists."""
+        assert not self._finished, "Can only rollback/commit once."
+        logger.debug(f"Removing path: {self._path}")
+        rmtree(self._path)
+        if self._backup_path is not None:
+            logger.debug(f"Restoring previous path: {self._backup_path} -> {self._path}")
+            os.rename(self._backup_path, self._path)
+            self._backup_path = None
+        self._finished = True
+
+    def commit(self) -> None:
+        """Commit the operation by removing the backup path, if it exists."""
+        assert not self._finished, "Can only rollback/commit once."
+        if self._backup_path is not None:
+            logger.debug(f"Removing backup path: {self._backup_path}")
+            rmtree(self._backup_path)
+            self._backup_path = None
+        self._finished = True
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]:
+        if not self._finished:
+            # Automatically commit or rollback based on whether an exception is underway.
+            if exc_val is None:
+                self.commit()
+            else:
+                self.rollback()
+        return False  # Do not suppress any exception underway
+
+
 class TranspilerInstaller(abc.ABC):
-
+
+    # TODO: Remove these properties when post-install is removed.
+    _install_path: Path
+    """The path where the transpiler is being installed, once this starts."""
+
+    def __init__(self, repository: TranspilerRepository, product_name: str) -> None:
         self._repository = repository
+        self._product_name = product_name
 
     _version_pattern = re.compile(r"[_-](\d+(?:[.\-_]\w*\d+)+)")
 
```
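The new `_PathBackup` context manager gives the installers transactional behaviour over the product directory: any existing install is renamed aside before the new one is written, and the backup is either discarded (`commit`) or restored (`rollback`) depending on how the operation ends; on a clean exit it commits automatically, and an exception triggers a rollback while still propagating. A minimal standalone sketch of that flow, where `do_install` is a hypothetical callable standing in for `TranspilerInstaller._install_version()`:

```python
import os
from collections.abc import Callable
from pathlib import Path
from shutil import rmtree


def install_with_backup(product_path: Path, do_install: Callable[[], bool]) -> bool:
    """Sketch of the backup/commit/rollback flow used by the new installer code."""
    backup_path = product_path.with_name(f"{product_path.name}-saved")
    if backup_path.exists():
        rmtree(backup_path)  # stale backup left over from an earlier failed run
    if product_path.exists():
        os.rename(product_path, backup_path)  # preserve the current install

    product_path.mkdir(parents=True, exist_ok=True)
    try:
        ok = do_install()  # hypothetical: populate product_path, return True on success
    except Exception:
        ok = False

    if ok:
        if backup_path.exists():
            rmtree(backup_path)  # commit: drop the old install
        return True
    rmtree(product_path)  # rollback: discard the partial install
    if backup_path.exists():
        os.rename(backup_path, product_path)  # restore the previous install
    return False
```

In the package itself this logic lives in `_install_version_with_backup`, which wraps the same steps in the `_PathBackup` context manager shown above.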
```diff
@@ -75,9 +137,44 @@ class TranspilerInstaller(abc.ABC):
             dump(version_data, f)
             f.write("\n")
 
+    def _install_version_with_backup(self, version: str) -> Path | None:
+        """Install a specific version of the transpiler, with backup handling."""
+        logger.info(f"Installing Databricks {self._product_name} transpiler (v{version})")
+        product_path = self._repository.transpilers_path() / self._product_name
+        with _PathBackup(product_path) as backup:
+            self._install_path = product_path / "lib"
+            self._install_path.mkdir(parents=True, exist_ok=True)
+            try:
+                result = self._install_version(version)
+            except (CalledProcessError, KeyError, ValueError) as e:
+                # Warning: if you end up here under the IntelliJ/PyCharm debugger, it can be because the debugger is
+                # trying to inject itself into the subprocess. Try disabling:
+                #   Settings | Build, Execution, Deployment | Python Debugger | Attach to subprocess automatically while debugging
+                # Note: Subprocess output is not captured, and should already be visible in the console.
+                logger.error(f"Failed to install {self._product_name} transpiler (v{version})", exc_info=e)
+                result = False
+
+            if result:
+                logger.info(f"Successfully installed {self._product_name} transpiler (v{version})")
+                self._store_product_state(product_path=product_path, version=version)
+                backup.commit()
+                return product_path
+            backup.rollback()
+            return None
+
+    @abc.abstractmethod
+    def _install_version(self, version: str) -> bool:
+        """Install a specific version of the transpiler, returning True if successful."""
+
 
 class WheelInstaller(TranspilerInstaller):
 
+    _venv_exec_cmd: Path
+    """Once created, the command to run the virtual environment's Python executable."""
+
+    _site_packages: Path
+    """Once created, the path to the site-packages directory in the virtual environment."""
+
     @classmethod
     def get_latest_artifact_version_from_pypi(cls, product_name: str) -> str | None:
         try:
```
```diff
@@ -96,8 +193,7 @@ class WheelInstaller(TranspilerInstaller):
         pypi_name: str,
         artifact: Path | None = None,
     ) -> None:
-        super().__init__(repository)
-        self._product_name = product_name
+        super().__init__(repository, product_name)
         self._pypi_name = pypi_name
         self._artifact = artifact
 
```
```diff
@@ -118,116 +214,45 @@ class WheelInstaller(TranspilerInstaller):
         if installed_version == latest_version:
             logger.info(f"{self._pypi_name} v{latest_version} already installed")
             return None
-        return self.
-
-    def _install_latest_version(self, version: str) -> Path | None:
-        logger.info(f"Installing Databricks {self._product_name} transpiler v{version}")
-        self._product_path = self._repository.transpilers_path() / self._product_name
-        backup_path = Path(f"{self._product_path!s}-saved")
-        if self._product_path.exists():
-            os.rename(self._product_path, backup_path)
-        self._install_path = self._product_path / "lib"
-        self._install_path.mkdir(parents=True, exist_ok=True)
-        try:
-            result = self._unsafe_install_latest_version(version)
-            logger.info(f"Successfully installed {self._pypi_name} v{version}")
-            if backup_path.exists():
-                rmtree(backup_path)
-            return result
-        except (CalledProcessError, ValueError) as e:
-            logger.error(f"Failed to install {self._pypi_name} v{version}", exc_info=e)
-            rmtree(self._product_path)
-            if backup_path.exists():
-                os.rename(backup_path, self._product_path)
-            return None
+        return self._install_version_with_backup(latest_version)
 
-    def
+    def _install_version(self, version: str) -> bool:
         self._create_venv()
         self._install_with_pip()
         self._copy_lsp_resources()
-        return self._post_install(
+        return self._post_install() is not None
 
     def _create_venv(self) -> None:
-
-
-
-
-
-
-
-        cmd_line = f"{sys.executable} -m venv .venv"
-        completed = run(cmd_line, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, shell=True, check=False)
-        if completed.returncode:
-            logger.error(f"Failed to create venv, error code: {completed.returncode}")
-            if completed.stdout:
-                for line in completed.stdout:
-                    logger.error(line)
-            if completed.stderr:
-                for line in completed.stderr:
-                    logger.error(line)
-        completed.check_returncode()
-        self._venv = self._install_path / ".venv"
-        self._site_packages = self._locate_site_packages()
-
-    def _locate_site_packages(self) -> Path:
-        # can't use sysconfig because it only works for currently running python
-        if sys.platform == "win32":
-            return self._locate_site_packages_windows()
-        return self._locate_site_packages_linux_or_macos()
-
-    def _locate_site_packages_windows(self) -> Path:
-        packages = self._venv / "Lib" / "site-packages"
-        if packages.exists():
-            return packages
-        raise ValueError(f"Could not locate 'site-packages' for {self._venv!s}")
-
-    def _locate_site_packages_linux_or_macos(self) -> Path:
-        lib = self._venv / "lib"
-        for dir_ in os.listdir(lib):
-            if dir_.startswith("python"):
-                packages = lib / dir_ / "site-packages"
-                if packages.exists():
-                    return packages
-        raise ValueError(f"Could not locate 'site-packages' for {self._venv!s}")
-
-    def _install_with_pip(self) -> None:
-        with chdir(self._install_path):
-            # the way to call pip from python is highly sensitive to os and source type
-            if self._artifact:
-                self._install_local_artifact()
-            else:
-                self._install_remote_artifact()
-
-    def _install_local_artifact(self) -> None:
-        pip = self._locate_pip()
-        pip = pip.relative_to(self._install_path)
-        target = self._site_packages
-        target = target.relative_to(self._install_path)
-        if sys.platform == "win32":
-            command = f"{pip!s} install {self._artifact!s} -t {target!s}"
-            completed = run(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, shell=False, check=False)
+        venv_path = self._install_path / ".venv"
+        # Sadly, some platform-specific variations need to be dealt with:
+        #  - Windows venvs do not use symlinks, but rather copies, when populating the venv.
+        #  - The library path is different.
+        if use_symlinks := sys.platform != "win32":
+            major, minor = sys.version_info[:2]
+            lib_path = venv_path / "lib" / f"python{major}.{minor}" / "site-packages"
         else:
-
-
-
-
-
-
-
-        pip = pip.relative_to(self._install_path)
-        target = self._site_packages
-        target = target.relative_to(self._install_path)
-        if sys.platform == "win32":
-            args = [str(pip), "install", self._pypi_name, "-t", str(target)]
-            completed = run(args, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, shell=False, check=False)
-        else:
-            command = f"'{pip!s}' install {self._pypi_name} -t '{target!s}'"
-            completed = run(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, shell=True, check=False)
-        # checking return code later makes debugging easier
-        completed.check_returncode()
+            lib_path = venv_path / "Lib" / "site-packages"
+        builder = venv.EnvBuilder(with_pip=True, prompt=f"{self._product_name}", symlinks=use_symlinks)
+        builder.create(venv_path)
+        context = builder.ensure_directories(venv_path)
+        logger.debug(f"Created virtual environment with context: {context}")
+        self._venv_exec_cmd = context.env_exec_cmd
+        self._site_packages = lib_path
 
-    def
-
+    def _install_with_pip(self) -> None:
+        # Based on: https://pip.pypa.io/en/stable/user_guide/#using-pip-from-your-program
+        # (But with venv_exec_cmd instead of sys.executable, so that we use the venv's pip.)
+        to_install: Path | str = self._artifact if self._artifact is not None else self._pypi_name
+        command: list[Path | str] = [
+            self._venv_exec_cmd,
+            "-m",
+            "pip",
+            "--disable-pip-version-check",
+            "install",
+            to_install,
+        ]
+        result = run(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, check=False)
+        result.check_returncode()
 
     def _copy_lsp_resources(self):
         lsp = self._site_packages / "lsp"
```
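`_create_venv` now builds the environment with `venv.EnvBuilder` instead of shelling out to `python -m venv`, and `_install_with_pip` invokes pip through the environment's own interpreter rather than locating a pip executable and site-packages path by hand. A self-contained sketch of the same approach; the directory and requirement name below are illustrative:

```python
import sys
import venv
from pathlib import Path
from subprocess import run


def create_venv_and_install(base_dir: Path, requirement: str) -> Path:
    """Create a virtual environment under base_dir and pip-install one requirement into it."""
    venv_path = base_dir / ".venv"
    # Windows venvs copy files instead of symlinking, and use Lib/ rather than lib/pythonX.Y/.
    use_symlinks = sys.platform != "win32"
    if use_symlinks:
        major, minor = sys.version_info[:2]
        site_packages = venv_path / "lib" / f"python{major}.{minor}" / "site-packages"
    else:
        site_packages = venv_path / "Lib" / "site-packages"

    builder = venv.EnvBuilder(with_pip=True, symlinks=use_symlinks)
    builder.create(venv_path)
    # env_exec_cmd mirrors what the new installer code uses: the venv's own interpreter.
    context = builder.ensure_directories(venv_path)

    # Run pip via the venv's interpreter so the install lands inside the venv.
    run(
        [context.env_exec_cmd, "-m", "pip", "--disable-pip-version-check", "install", requirement],
        check=True,
    )
    return site_packages


if __name__ == "__main__":
    print(create_venv_and_install(Path("/tmp/demo-transpiler"), "wheel"))
```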
```diff
@@ -235,21 +260,20 @@ class WheelInstaller(TranspilerInstaller):
             raise ValueError("Installed transpiler is missing a 'lsp' folder")
         shutil.copytree(lsp, self._install_path, dirs_exist_ok=True)
 
-    def _post_install(self
+    def _post_install(self) -> Path | None:
         config = self._install_path / "config.yml"
         if not config.exists():
             raise ValueError("Installed transpiler is missing a 'config.yml' file in its 'lsp' folder")
         install_ext = "ps1" if sys.platform == "win32" else "sh"
         install_script = f"installer.{install_ext}"
-
-        if
-            self._run_custom_installer(
-        self._store_product_state(product_path=self._product_path, version=version)
+        installer_path = self._install_path / install_script
+        if installer_path.exists():
+            self._run_custom_installer(installer_path)
         return self._install_path
 
-    def _run_custom_installer(self,
-        args = [
-        run(args, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, cwd=
+    def _run_custom_installer(self, installer_path: Path) -> None:
+        args = [installer_path]
+        run(args, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, cwd=self._install_path, check=True)
 
 
 class MavenInstaller(TranspilerInstaller):
```
```diff
@@ -336,8 +360,7 @@ class MavenInstaller(TranspilerInstaller):
         artifact_id: str,
         artifact: Path | None = None,
     ) -> None:
-        super().__init__(repository)
-        self._product_name = product_name
+        super().__init__(repository, product_name)
         self._group_id = group_id
         self._artifact_id = artifact_id
         self._artifact = artifact
```
```diff
@@ -358,40 +381,15 @@ class MavenInstaller(TranspilerInstaller):
         if installed_version == latest_version:
             logger.info(f"Databricks {self._product_name} transpiler v{latest_version} already installed")
             return None
-        return self.
+        return self._install_version_with_backup(latest_version)
 
-    def _install_version(self, version: str) ->
-        logger.info(f"Installing Databricks {self._product_name} transpiler v{version}")
-        self._product_path = self._repository.transpilers_path() / self._product_name
-        backup_path = Path(f"{self._product_path!s}-saved")
-        if backup_path.exists():
-            rmtree(backup_path)
-        if self._product_path.exists():
-            os.rename(self._product_path, backup_path)
-        self._product_path.mkdir(parents=True)
-        self._install_path = self._product_path / "lib"
-        self._install_path.mkdir()
-        try:
-            if self._unsafe_install_version(version):
-                logger.info(f"Successfully installed {self._product_name} v{version}")
-                self._store_product_state(self._product_path, version)
-                if backup_path.exists():
-                    rmtree(backup_path)
-                return self._product_path
-        except (KeyError, ValueError) as e:
-            logger.error(f"Failed to install Databricks {self._product_name} transpiler v{version}", exc_info=e)
-            rmtree(self._product_path)
-            if backup_path.exists():
-                os.rename(backup_path, self._product_path)
-        return None
-
-    def _unsafe_install_version(self, version: str) -> bool:
+    def _install_version(self, version: str) -> bool:
         jar_file_path = self._install_path / f"{self._artifact_id}.jar"
         if self._artifact:
-            logger.debug(f"Copying
+            logger.debug(f"Copying: {self._artifact} -> {jar_file_path}")
             shutil.copyfile(self._artifact, jar_file_path)
         elif not self.download_artifact_from_maven(self._group_id, self._artifact_id, version, jar_file_path):
-            logger.error(f"Failed to install Databricks {self._product_name} transpiler v{version}")
+            logger.error(f"Failed to install Databricks {self._product_name} transpiler (v{version})")
             return False
         self._copy_lsp_config(jar_file_path)
         return True
```
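`MavenInstaller._install_version` either copies a locally supplied jar or calls `download_artifact_from_maven`; that downloader is not part of this hunk. As a rough illustration of what fetching a release jar from Maven Central's standard repository layout looks like (the helper below is not the package's implementation, only the URL layout is standard):

```python
from pathlib import Path
from urllib import request


def download_maven_jar(group_id: str, artifact_id: str, version: str, target: Path) -> bool:
    """Fetch {artifact_id}-{version}.jar from Maven Central into `target` (illustrative helper)."""
    url = (
        "https://repo1.maven.org/maven2/"
        f"{group_id.replace('.', '/')}/{artifact_id}/{version}/{artifact_id}-{version}.jar"
    )
    try:
        with request.urlopen(url) as response, target.open("wb") as f:
            f.write(response.read())
    except OSError:  # URLError and HTTPError are OSError subclasses
        return False
    return True


# e.g. download_maven_jar("org.example", "example-transpiler", "1.0.0", Path("example-transpiler.jar"))
```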
```diff
@@ -449,6 +447,13 @@ class WorkspaceInstaller:
         logger.info("Installation completed successfully! Please refer to the documentation for the next steps.")
         return config
 
+    def has_installed_transpilers(self) -> bool:
+        """Detect whether there are transpilers currently installed."""
+        installed_transpilers = self._transpiler_repository.all_transpiler_names()
+        if installed_transpilers:
+            logger.info(f"Detected installed transpilers: {sorted(installed_transpilers)}")
+        return bool(installed_transpilers)
+
     def install_bladebridge(self, artifact: Path | None = None) -> None:
         local_name = "bladebridge"
         pypi_name = "databricks-bb-plugin"
```
```diff
@@ -802,3 +807,28 @@ class WorkspaceInstaller:
 
     def _has_necessary_access(self, catalog_name: str, schema_name: str, volume_name: str | None = None):
         self._resource_configurator.has_necessary_access(catalog_name, schema_name, volume_name)
+
+
+def installer(ws: WorkspaceClient, transpiler_repository: TranspilerRepository) -> WorkspaceInstaller:
+    app_context = ApplicationContext(_verify_workspace_client(ws))
+    return WorkspaceInstaller(
+        app_context.workspace_client,
+        app_context.prompts,
+        app_context.installation,
+        app_context.install_state,
+        app_context.product_info,
+        app_context.resource_configurator,
+        app_context.workspace_installation,
+        transpiler_repository=transpiler_repository,
+    )
+
+
+def _verify_workspace_client(ws: WorkspaceClient) -> WorkspaceClient:
+    """Verifies the workspace client configuration, ensuring it has the correct product info."""
+
+    # Using reflection to set right value for _product_info for telemetry
+    product_info = getattr(ws.config, '_product_info')
+    if product_info[0] != "lakebridge":
+        setattr(ws.config, '_product_info', ('lakebridge', __version__))
+
+    return ws
```
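The new module-level `installer()` factory (together with `_verify_workspace_client`, which forces the SDK's product info to `lakebridge` for telemetry) gives `cli.py` and `base_install.py` a single way to build a `WorkspaceInstaller`. A hypothetical caller might look like the following; only `installer()`, `has_installed_transpilers()` and `install_bladebridge()` appear in this diff, while the default `TranspilerRepository()` construction and the ambient SDK authentication are assumptions:

```python
from databricks.sdk import WorkspaceClient

from databricks.labs.lakebridge.install import installer
from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository

# Assumption: ambient authentication (DATABRICKS_HOST/DATABRICKS_TOKEN or a configured profile).
ws = WorkspaceClient()
# Assumption: TranspilerRepository can be constructed with defaults; the real CLI may pass a specific location.
workspace_installer = installer(ws, TranspilerRepository())

if not workspace_installer.has_installed_transpilers():
    # install_bladebridge() is shown elsewhere in this diff; passing no artifact pulls the wheel from PyPI.
    workspace_installer.install_bladebridge()
```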
databricks/labs/lakebridge/reconcile/compare.py

```diff
@@ -3,6 +3,7 @@ from functools import reduce
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, expr, lit
 
+from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
 from databricks.labs.lakebridge.reconcile.exception import ColumnMismatchException
 from databricks.labs.lakebridge.reconcile.recon_capture import (
     ReconIntermediatePersist,
```
```diff
@@ -22,7 +23,7 @@ _HASH_COLUMN_NAME = "hash_value_recon"
 _SAMPLE_ROWS = 50
 
 
-def
+def _raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
     error_msg = (
         f"{msg}\n"
         f"columns missing in source: {','.join(source_missing) if source_missing else None}\n"
```
```diff
@@ -33,12 +34,25 @@ def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_
 
 def _generate_join_condition(source_alias, target_alias, key_columns):
     conditions = [
-        col(f"{source_alias}.{key_column}").eqNullSafe(
+        col(f"{source_alias}.{DialectUtils.ansi_normalize_identifier(key_column)}").eqNullSafe(
+            col(f"{target_alias}.{DialectUtils.ansi_normalize_identifier(key_column)}")
+        )
         for key_column in key_columns
     ]
     return reduce(lambda a, b: a & b, conditions)
 
 
+def _build_column_selector(table_name, column_name):
+    alias = DialectUtils.ansi_normalize_identifier(f"{table_name}_{DialectUtils.unnormalize_identifier(column_name)}")
+    return f'{table_name}.{DialectUtils.ansi_normalize_identifier(column_name)} as {alias}'
+
+
+def _build_mismatch_column(table, column):
+    return col(DialectUtils.ansi_normalize_identifier(column)).alias(
+        DialectUtils.unnormalize_identifier(column.replace(f'{table}_', '').lower())
+    )
+
+
 def reconcile_data(
     source: DataFrame,
     target: DataFrame,
```
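The reconcile helpers now push every column reference through `DialectUtils.ansi_normalize_identifier` and `DialectUtils.unnormalize_identifier` from the new `reconcile/connectors/dialect_utils.py`, presumably so that quoted or case- and space-sensitive identifiers survive the Spark round trip. The actual implementation is not shown in this diff; the sketch below only illustrates the kind of round trip the call sites imply, under the assumption that normalization means back-tick quoting:

```python
# Assumption: "normalized" means a back-tick-quoted Spark identifier and
# "unnormalized" means the bare name; the real DialectUtils may differ.
def ansi_normalize_identifier(name: str) -> str:
    bare = unnormalize_identifier(name)
    return "`" + bare.replace("`", "``") + "`"


def unnormalize_identifier(name: str) -> str:
    if name.startswith("`") and name.endswith("`"):
        return name[1:-1].replace("``", "`")
    return name


assert ansi_normalize_identifier("Order ID") == "`Order ID`"
assert ansi_normalize_identifier("`Order ID`") == "`Order ID`"  # already-quoted names are not double-quoted
assert unnormalize_identifier("`Order ID`") == "Order ID"
```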
```diff
@@ -59,14 +73,14 @@ def reconcile_data(
             how="full",
         )
         .selectExpr(
-            *[f'{source_alias
-            *[f'{target_alias
+            *[f'{_build_column_selector(source_alias, col_name)}' for col_name in source.columns],
+            *[f'{_build_column_selector(target_alias, col_name)}' for col_name in target.columns],
         )
     )
 
     # Write unmatched df to volume
     df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(df)
-    logger.warning(f"Unmatched data
+    logger.warning(f"Unmatched data was written to {path} successfully")
 
     mismatch = _get_mismatch_data(df, source_alias, target_alias) if report_type in {"all", "data"} else None
 
```
```diff
@@ -74,24 +88,24 @@ def reconcile_data(
         df.filter(col(f"{source_alias}_{_HASH_COLUMN_NAME}").isNull())
         .select(
             *[
-
+                _build_mismatch_column(target_alias, col_name)
                 for col_name in df.columns
                 if col_name.startswith(f'{target_alias}_')
             ]
         )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
     )
 
     missing_in_tgt = (
         df.filter(col(f"{target_alias}_{_HASH_COLUMN_NAME}").isNull())
         .select(
             *[
-
+                _build_mismatch_column(source_alias, col_name)
                 for col_name in df.columns
                 if col_name.startswith(f'{source_alias}_')
             ]
         )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
     )
     mismatch_count = 0
     if mismatch:
```
```diff
@@ -123,23 +137,27 @@ def _get_mismatch_data(df: DataFrame, src_alias: str, tgt_alias: str) -> DataFra
         .filter(col("hash_match") == lit(False))
         .select(
             *[
-
+                _build_mismatch_column(src_alias, col_name)
                 for col_name in df.columns
                 if col_name.startswith(f'{src_alias}_')
             ]
         )
-        .drop(_HASH_COLUMN_NAME)
+        .drop(f"{_HASH_COLUMN_NAME}")
     )
 
 
-def
-
-
+def _build_capture_df(df: DataFrame) -> DataFrame:
+    columns = [
+        col(DialectUtils.ansi_normalize_identifier(column)).alias(DialectUtils.unnormalize_identifier(column))
+        for column in df.columns
+    ]
+    return df.select(*columns)
 
 
 def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_columns: list[str]) -> MismatchOutput:
-    source_df =
-    target_df =
+    source_df = _build_capture_df(source)
+    target_df = _build_capture_df(target)
+    unnormalized_key_columns = [DialectUtils.unnormalize_identifier(column) for column in key_columns]
 
     source_columns = source_df.columns
     target_columns = target_df.columns
```
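`capture_mismatch_data_and_columns` normalizes both frames with `_build_capture_df`, then `_get_mismatch_df` joins them on the key columns under `base`/`compare` aliases and emits a boolean `<column>_match` per compared column. A toy PySpark illustration of that join-and-compare shape; the data and session setup are illustrative only:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

spark = SparkSession.builder.master("local[1]").appName("mismatch-demo").getOrCreate()

source = spark.createDataFrame([(1, "alice"), (2, "bob")], ["id", "name"])
target = spark.createDataFrame([(1, "alice"), (2, "bobby")], ["id", "name"])

mismatch = (
    source.alias("base")
    .join(target.alias("compare"), on=["id"], how="inner")
    .select(
        col("id"),
        col("base.name").alias("name_base"),
        col("compare.name").alias("name_compare"),
        expr("base.name == compare.name").alias("name_match"),
    )
)
mismatch.show()
# Rows where name_match is false are the per-column mismatches the report captures.
```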
```diff
@@ -148,10 +166,10 @@ def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_
         message = "source and target should have same columns for capturing the mismatch data"
         source_missing = [column for column in target_columns if column not in source_columns]
         target_missing = [column for column in source_columns if column not in target_columns]
-        raise
+        raise _raise_column_mismatch_exception(message, source_missing, target_missing)
 
-    check_columns = [column for column in source_columns if column not in
-    mismatch_df = _get_mismatch_df(source_df, target_df,
+    check_columns = [column for column in source_columns if column not in unnormalized_key_columns]
+    mismatch_df = _get_mismatch_df(source_df, target_df, unnormalized_key_columns, check_columns)
     mismatch_columns = _get_mismatch_columns(mismatch_df, check_columns)
     return MismatchOutput(mismatch_df, mismatch_columns)
 
```
```diff
@@ -167,31 +185,50 @@ def _get_mismatch_columns(df: DataFrame, columns: list[str]):
     return mismatch_columns
 
 
+def _normalize_mismatch_df_col(column, suffix):
+    unnormalized = DialectUtils.unnormalize_identifier(column) + suffix
+    return DialectUtils.ansi_normalize_identifier(unnormalized)
+
+
+def _unnormalize_mismatch_df_col(column, suffix):
+    unnormalized = DialectUtils.unnormalize_identifier(column) + suffix
+    return unnormalized
+
+
 def _get_mismatch_df(source: DataFrame, target: DataFrame, key_columns: list[str], column_list: list[str]):
-    source_aliased = [
-
+    source_aliased = [
+        col('base.' + DialectUtils.ansi_normalize_identifier(column)).alias(
+            _unnormalize_mismatch_df_col(column, '_base')
+        )
+        for column in column_list
+    ]
+    target_aliased = [
+        col('compare.' + DialectUtils.ansi_normalize_identifier(column)).alias(
+            _unnormalize_mismatch_df_col(column, '_compare')
+        )
+        for column in column_list
+    ]
 
-    match_expr = [
-
+    match_expr = [
+        expr(f"{_normalize_mismatch_df_col(column,'_base')}=={_normalize_mismatch_df_col(column,'_compare')}").alias(
+            _unnormalize_mismatch_df_col(column, '_match')
+        )
+        for column in column_list
+    ]
+    key_cols = [col(DialectUtils.ansi_normalize_identifier(column)) for column in key_columns]
     select_expr = key_cols + source_aliased + target_aliased + match_expr
 
-    filter_columns = " and ".join([column + "_match" for column in column_list])
-    filter_expr = ~expr(filter_columns)
-
     logger.info(f"KEY COLUMNS: {key_columns}")
-    logger.info(f"FILTER COLUMNS: {filter_expr}")
     logger.info(f"SELECT COLUMNS: {select_expr}")
 
     mismatch_df = (
         source.alias('base').join(other=target.alias('compare'), on=key_columns, how="inner").select(*select_expr)
     )
 
-    compare_columns = [
-
-
-
-def alias_column_str(alias: str, columns: list[str]) -> list[str]:
-    return [f"{alias}.{column}" for column in columns]
+    compare_columns = [
+        DialectUtils.ansi_normalize_identifier(column) for column in mismatch_df.columns if column not in key_columns
+    ]
+    return mismatch_df.select(*key_cols + sorted(compare_columns))
 
 
 def _generate_agg_join_condition(source_alias: str, target_alias: str, key_columns: list[str]):
```
|