databricks-labs-lakebridge 0.10.7__py3-none-any.whl → 0.10.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- databricks/labs/lakebridge/__about__.py +1 -1
- databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
- databricks/labs/lakebridge/base_install.py +1 -5
- databricks/labs/lakebridge/cli.py +13 -6
- databricks/labs/lakebridge/helpers/validation.py +5 -3
- databricks/labs/lakebridge/install.py +40 -481
- databricks/labs/lakebridge/reconcile/connectors/data_source.py +9 -5
- databricks/labs/lakebridge/reconcile/connectors/databricks.py +2 -1
- databricks/labs/lakebridge/reconcile/connectors/oracle.py +2 -1
- databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
- databricks/labs/lakebridge/reconcile/connectors/snowflake.py +50 -29
- databricks/labs/lakebridge/reconcile/connectors/tsql.py +2 -1
- databricks/labs/lakebridge/reconcile/query_builder/base.py +50 -11
- databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
- databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
- databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
- databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
- databricks/labs/lakebridge/reconcile/recon_config.py +0 -15
- databricks/labs/lakebridge/reconcile/reconciliation.py +4 -1
- databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +11 -31
- databricks/labs/lakebridge/reconcile/trigger_recon_service.py +4 -1
- databricks/labs/lakebridge/transpiler/execute.py +34 -28
- databricks/labs/lakebridge/transpiler/installers.py +523 -0
- databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +2 -0
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +30 -28
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
- {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/install.py

@@ -1,22 +1,10 @@
-import re
-import abc
 import dataclasses
 import logging
 import os
-import shutil
-import sys
-import venv
 import webbrowser
-
-from datetime import datetime, timezone
-from json import loads, dump
+from collections.abc import Set, Callable, Sequence
 from pathlib import Path
-from shutil import rmtree, move
-from subprocess import run, CalledProcessError
-from typing import Any, Literal, cast
-from urllib import request
-from urllib.error import URLError, HTTPError
-from zipfile import ZipFile
+from typing import Any, cast

 from databricks.labs.blueprint.installation import Installation, JsonValue, SerdeError
 from databricks.labs.blueprint.installer import InstallState
@@ -37,6 +25,11 @@ from databricks.labs.lakebridge.contexts.application import ApplicationContext
 from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator
 from databricks.labs.lakebridge.deployment.installation import WorkspaceInstallation
 from databricks.labs.lakebridge.reconcile.constants import ReconReportType, ReconSourceType
+from databricks.labs.lakebridge.transpiler.installers import (
+    BladebridgeInstaller,
+    MorpheusInstaller,
+    TranspilerInstaller,
+)
 from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository

 logger = logging.getLogger(__name__)
@@ -44,363 +37,6 @@ logger = logging.getLogger(__name__)
 TRANSPILER_WAREHOUSE_PREFIX = "Lakebridge Transpiler Validation"


-class _PathBackup:
-    """A context manager to preserve a path before performing an operation, and optionally restore it afterwards."""
-
-    def __init__(self, path: Path) -> None:
-        self._path = path
-        self._backup_path: Path | None = None
-        self._finished = False
-
-    def __enter__(self) -> "_PathBackup":
-        self.start()
-        return self
-
-    def start(self) -> None:
-        """Start the backup process by creating a backup of the path, if it already exists."""
-        backup_path = self._path.with_name(f"{self._path.name}-saved")
-        if backup_path.exists():
-            logger.debug(f"Existing backup found, removing: {backup_path}")
-            rmtree(backup_path)
-        if self._path.exists():
-            logger.debug(f"Backing up existing path: {self._path} -> {backup_path}")
-            os.rename(self._path, backup_path)
-            self._backup_path = backup_path
-        else:
-            self._backup_path = None
-
-    def rollback(self) -> None:
-        """Rollback the operation by restoring the backup path, if it exists."""
-        assert not self._finished, "Can only rollback/commit once."
-        logger.debug(f"Removing path: {self._path}")
-        rmtree(self._path)
-        if self._backup_path is not None:
-            logger.debug(f"Restoring previous path: {self._backup_path} -> {self._path}")
-            os.rename(self._backup_path, self._path)
-            self._backup_path = None
-        self._finished = True
-
-    def commit(self) -> None:
-        """Commit the operation by removing the backup path, if it exists."""
-        assert not self._finished, "Can only rollback/commit once."
-        if self._backup_path is not None:
-            logger.debug(f"Removing backup path: {self._backup_path}")
-            rmtree(self._backup_path)
-            self._backup_path = None
-        self._finished = True
-
-    def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]:
-        if not self._finished:
-            # Automatically commit or rollback based on whether an exception is underway.
-            if exc_val is None:
-                self.commit()
-            else:
-                self.rollback()
-        return False  # Do not suppress any exception underway
-
-
-class TranspilerInstaller(abc.ABC):
-
-    # TODO: Remove these properties when post-install is removed.
-    _install_path: Path
-    """The path where the transpiler is being installed, once this starts."""
-
-    def __init__(self, repository: TranspilerRepository, product_name: str) -> None:
-        self._repository = repository
-        self._product_name = product_name
-
-    _version_pattern = re.compile(r"[_-](\d+(?:[.\-_]\w*\d+)+)")
-
-    @classmethod
-    def get_local_artifact_version(cls, artifact: Path) -> str | None:
-        # TODO: Get the version from the metadata inside the artifact rather than relying on the filename.
-        match = cls._version_pattern.search(artifact.stem)
-        if not match:
-            return None
-        group = match.group(0)
-        if not group:
-            return None
-        # TODO: Update the regex to take care of these trimming scenarios.
-        if group.startswith('-'):
-            group = group[1:]
-        if group.endswith("-py3"):
-            group = group[:-4]
-        return group
-
-    @classmethod
-    def _store_product_state(cls, product_path: Path, version: str) -> None:
-        state_path = product_path / "state"
-        state_path.mkdir()
-        version_data = {"version": f"v{version}", "date": datetime.now(timezone.utc).isoformat()}
-        version_path = state_path / "version.json"
-        with version_path.open("w", encoding="utf-8") as f:
-            dump(version_data, f)
-            f.write("\n")
-
-    def _install_version_with_backup(self, version: str) -> Path | None:
-        """Install a specific version of the transpiler, with backup handling."""
-        logger.info(f"Installing Databricks {self._product_name} transpiler (v{version})")
-        product_path = self._repository.transpilers_path() / self._product_name
-        with _PathBackup(product_path) as backup:
-            self._install_path = product_path / "lib"
-            self._install_path.mkdir(parents=True, exist_ok=True)
-            try:
-                result = self._install_version(version)
-            except (CalledProcessError, KeyError, ValueError) as e:
-                # Warning: if you end up here under the IntelliJ/PyCharm debugger, it can be because the debugger is
-                # trying to inject itself into the subprocess. Try disabling:
-                #   Settings | Build, Execution, Deployment | Python Debugger | Attach to subprocess automatically while debugging
-                # Note: Subprocess output is not captured, and should already be visible in the console.
-                logger.error(f"Failed to install {self._product_name} transpiler (v{version})", exc_info=e)
-                result = False
-
-            if result:
-                logger.info(f"Successfully installed {self._product_name} transpiler (v{version})")
-                self._store_product_state(product_path=product_path, version=version)
-                backup.commit()
-                return product_path
-            backup.rollback()
-            return None
-
-    @abc.abstractmethod
-    def _install_version(self, version: str) -> bool:
-        """Install a specific version of the transpiler, returning True if successful."""
-
-
-class WheelInstaller(TranspilerInstaller):
-
-    _venv_exec_cmd: Path
-    """Once created, the command to run the virtual environment's Python executable."""
-
-    _site_packages: Path
-    """Once created, the path to the site-packages directory in the virtual environment."""
-
-    @classmethod
-    def get_latest_artifact_version_from_pypi(cls, product_name: str) -> str | None:
-        try:
-            with request.urlopen(f"https://pypi.org/pypi/{product_name}/json") as server:
-                text: bytes = server.read()
-            data: dict[str, Any] = loads(text)
-            return data.get("info", {}).get('version', None)
-        except HTTPError as e:
-            logger.error(f"Error while fetching PyPI metadata: {product_name}", exc_info=e)
-            return None
-
-    def __init__(
-        self,
-        repository: TranspilerRepository,
-        product_name: str,
-        pypi_name: str,
-        artifact: Path | None = None,
-    ) -> None:
-        super().__init__(repository, product_name)
-        self._pypi_name = pypi_name
-        self._artifact = artifact
-
-    def install(self) -> Path | None:
-        return self._install_checking_versions()
-
-    def _install_checking_versions(self) -> Path | None:
-        latest_version = (
-            self.get_local_artifact_version(self._artifact)
-            if self._artifact
-            else self.get_latest_artifact_version_from_pypi(self._pypi_name)
-        )
-        if latest_version is None:
-            logger.warning(f"Could not determine the latest version of {self._pypi_name}")
-            logger.error(f"Failed to install transpiler: {self._product_name}")
-            return None
-        installed_version = self._repository.get_installed_version(self._product_name)
-        if installed_version == latest_version:
-            logger.info(f"{self._pypi_name} v{latest_version} already installed")
-            return None
-        return self._install_version_with_backup(latest_version)
-
-    def _install_version(self, version: str) -> bool:
-        self._create_venv()
-        self._install_with_pip()
-        self._copy_lsp_resources()
-        return self._post_install() is not None
-
-    def _create_venv(self) -> None:
-        venv_path = self._install_path / ".venv"
-        # Sadly, some platform-specific variations need to be dealt with:
-        #   - Windows venvs do not use symlinks, but rather copies, when populating the venv.
-        #   - The library path is different.
-        if use_symlinks := sys.platform != "win32":
-            major, minor = sys.version_info[:2]
-            lib_path = venv_path / "lib" / f"python{major}.{minor}" / "site-packages"
-        else:
-            lib_path = venv_path / "Lib" / "site-packages"
-        builder = venv.EnvBuilder(with_pip=True, prompt=f"{self._product_name}", symlinks=use_symlinks)
-        builder.create(venv_path)
-        context = builder.ensure_directories(venv_path)
-        logger.debug(f"Created virtual environment with context: {context}")
-        self._venv_exec_cmd = context.env_exec_cmd
-        self._site_packages = lib_path
-
-    def _install_with_pip(self) -> None:
-        # Based on: https://pip.pypa.io/en/stable/user_guide/#using-pip-from-your-program
-        # (But with venv_exec_cmd instead of sys.executable, so that we use the venv's pip.)
-        to_install: Path | str = self._artifact if self._artifact is not None else self._pypi_name
-        command: list[Path | str] = [
-            self._venv_exec_cmd,
-            "-m",
-            "pip",
-            "--disable-pip-version-check",
-            "install",
-            to_install,
-        ]
-        result = run(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, check=False)
-        result.check_returncode()
-
-    def _copy_lsp_resources(self):
-        lsp = self._site_packages / "lsp"
-        if not lsp.exists():
-            raise ValueError("Installed transpiler is missing a 'lsp' folder")
-        shutil.copytree(lsp, self._install_path, dirs_exist_ok=True)
-
-    def _post_install(self) -> Path | None:
-        config = self._install_path / "config.yml"
-        if not config.exists():
-            raise ValueError("Installed transpiler is missing a 'config.yml' file in its 'lsp' folder")
-        install_ext = "ps1" if sys.platform == "win32" else "sh"
-        install_script = f"installer.{install_ext}"
-        installer_path = self._install_path / install_script
-        if installer_path.exists():
-            self._run_custom_installer(installer_path)
-        return self._install_path
-
-    def _run_custom_installer(self, installer_path: Path) -> None:
-        args = [installer_path]
-        run(args, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, cwd=self._install_path, check=True)
-
-
-class MavenInstaller(TranspilerInstaller):
-    # Maven Central, base URL.
-    _maven_central_repo: str = "https://repo.maven.apache.org/maven2/"
-
-    @classmethod
-    def _artifact_base_url(cls, group_id: str, artifact_id: str) -> str:
-        """Construct the base URL for a Maven artifact."""
-        # Reference: https://maven.apache.org/repositories/layout.html
-        group_path = group_id.replace(".", "/")
-        return f"{cls._maven_central_repo}{group_path}/{artifact_id}/"
-
-    @classmethod
-    def artifact_metadata_url(cls, group_id: str, artifact_id: str) -> str:
-        """Get the metadata URL for a Maven artifact."""
-        # TODO: Unit test this method.
-        return f"{cls._artifact_base_url(group_id, artifact_id)}maven-metadata.xml"
-
-    @classmethod
-    def artifact_url(
-        cls, group_id: str, artifact_id: str, version: str, classifier: str | None = None, extension: str = "jar"
-    ) -> str:
-        """Get the URL for a versioned Maven artifact."""
-        # TODO: Unit test this method, including classifier and extension.
-        _classifier = f"-{classifier}" if classifier else ""
-        artifact_base_url = cls._artifact_base_url(group_id, artifact_id)
-        return f"{artifact_base_url}{version}/{artifact_id}-{version}{_classifier}.{extension}"
-
-    @classmethod
-    def get_current_maven_artifact_version(cls, group_id: str, artifact_id: str) -> str | None:
-        url = cls.artifact_metadata_url(group_id, artifact_id)
-        try:
-            with request.urlopen(url) as server:
-                text = server.read()
-        except HTTPError as e:
-            logger.error(f"Error while fetching maven metadata: {group_id}:{artifact_id}", exc_info=e)
-            return None
-        logger.debug(f"Maven metadata for {group_id}:{artifact_id}: {text}")
-        return cls._extract_latest_release_version(text)
-
-    @classmethod
-    def _extract_latest_release_version(cls, maven_metadata: str) -> str | None:
-        """Extract the latest release version from Maven metadata."""
-        # Reference: https://maven.apache.org/repositories/metadata.html#The_A_Level_Metadata
-        # TODO: Unit test this method, to verify the sequence of things it checks for.
-        root = ET.fromstring(maven_metadata)
-        for label in ("release", "latest"):
-            version = root.findtext(f"./versioning/{label}")
-            if version is not None:
-                return version
-        return root.findtext("./versioning/versions/version[last()]")
-
-    @classmethod
-    def download_artifact_from_maven(
-        cls,
-        group_id: str,
-        artifact_id: str,
-        version: str,
-        target: Path,
-        classifier: str | None = None,
-        extension: str = "jar",
-    ) -> bool:
-        if target.exists():
-            logger.warning(f"Skipping download of {group_id}:{artifact_id}:{version}; target already exists: {target}")
-            return True
-        url = cls.artifact_url(group_id, artifact_id, version, classifier, extension)
-        try:
-            path, _ = request.urlretrieve(url)
-            logger.debug(f"Downloaded maven artefact from {url} to {path}")
-        except URLError as e:
-            logger.error(f"Unable to download maven artefact: {group_id}:{artifact_id}:{version}", exc_info=e)
-            return False
-        logger.debug(f"Moving {path} to {target}")
-        move(path, target)
-        logger.info(f"Successfully installed: {group_id}:{artifact_id}:{version}")
-        return True
-
-    def __init__(
-        self,
-        repository: TranspilerRepository,
-        product_name: str,
-        group_id: str,
-        artifact_id: str,
-        artifact: Path | None = None,
-    ) -> None:
-        super().__init__(repository, product_name)
-        self._group_id = group_id
-        self._artifact_id = artifact_id
-        self._artifact = artifact
-
-    def install(self) -> Path | None:
-        return self._install_checking_versions()
-
-    def _install_checking_versions(self) -> Path | None:
-        if self._artifact:
-            latest_version = self.get_local_artifact_version(self._artifact)
-        else:
-            latest_version = self.get_current_maven_artifact_version(self._group_id, self._artifact_id)
-        if latest_version is None:
-            logger.warning(f"Could not determine the latest version of Databricks {self._product_name} transpiler")
-            logger.error("Failed to install transpiler: Databricks {self._product_name} transpiler")
-            return None
-        installed_version = self._repository.get_installed_version(self._product_name)
-        if installed_version == latest_version:
-            logger.info(f"Databricks {self._product_name} transpiler v{latest_version} already installed")
-            return None
-        return self._install_version_with_backup(latest_version)
-
-    def _install_version(self, version: str) -> bool:
-        jar_file_path = self._install_path / f"{self._artifact_id}.jar"
-        if self._artifact:
-            logger.debug(f"Copying: {self._artifact} -> {jar_file_path}")
-            shutil.copyfile(self._artifact, jar_file_path)
-        elif not self.download_artifact_from_maven(self._group_id, self._artifact_id, version, jar_file_path):
-            logger.error(f"Failed to install Databricks {self._product_name} transpiler (v{version})")
-            return False
-        self._copy_lsp_config(jar_file_path)
-        return True
-
-    def _copy_lsp_config(self, jar_file_path: Path) -> None:
-        with ZipFile(jar_file_path) as zip_file:
-            zip_file.extract("lsp/config.yml", self._install_path)
-        shutil.move(self._install_path / "lsp" / "config.yml", self._install_path / "config.yml")
-        os.rmdir(self._install_path / "lsp")
-
-
 class WorkspaceInstaller:
     def __init__(
         self,
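Note: the `_PathBackup` context manager deleted here (the installer machinery moves to the new `databricks/labs/lakebridge/transpiler/installers.py`, per the file list above) wraps an install directory in commit/rollback semantics. A minimal usage sketch, assuming the `_PathBackup` class from the diff above is in scope; the path and the `install_new_version` step are illustrative only:

```python
from pathlib import Path

def install_new_version(target: Path) -> None:
    """Hypothetical install step, standing in for a real transpiler installation."""
    (target / "lib").mkdir(parents=True, exist_ok=True)

product_path = Path("/tmp/lakebridge-demo/some-transpiler")

# __exit__ commits on a clean exit (deleting the "-saved" backup directory);
# if an exception escapes the block, it rolls back instead: the partial
# install is removed and the backup is renamed into place.
with _PathBackup(product_path):  # assumes _PathBackup from the diff above
    install_new_version(product_path)
```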
@@ -412,7 +48,12 @@ class WorkspaceInstaller:
         resource_configurator: ResourceConfigurator,
         workspace_installation: WorkspaceInstallation,
         environ: dict[str, str] | None = None,
+        *,
         transpiler_repository: TranspilerRepository = TranspilerRepository.user_home(),
+        transpiler_installers: Sequence[Callable[[TranspilerRepository], TranspilerInstaller]] = (
+            BladebridgeInstaller,
+            MorpheusInstaller,
+        ),
     ):
         self._ws = ws
         self._prompts = prompts
@@ -422,6 +63,7 @@ class WorkspaceInstaller:
         self._resource_configurator = resource_configurator
         self._ws_installation = workspace_installation
         self._transpiler_repository = transpiler_repository
+        self._transpiler_installer_factories = transpiler_installers

         if not environ:
             environ = dict(os.environ.items())
@@ -430,15 +72,19 @@
             msg = "WorkspaceInstaller is not supposed to be executed in Databricks Runtime"
             raise SystemExit(msg)

+    @property
+    def _transpiler_installers(self) -> Set[TranspilerInstaller]:
+        return frozenset(factory(self._transpiler_repository) for factory in self._transpiler_installer_factories)
+
     def run(
         self, module: str, config: LakebridgeConfiguration | None = None, artifact: str | None = None
     ) -> LakebridgeConfiguration:
         logger.debug(f"Initializing workspace installation for module: {module} (config: {config})")
         if module == "transpile" and artifact:
-            self.install_artifact(artifact)
+            self._install_artifact(artifact)
         elif module in {"transpile", "all"}:
-            self.[…]
-
+            for transpiler_installer in self._transpiler_installers:
+                transpiler_installer.install()
         if not config:
             config = self.configure(module)
         if self._is_testing():
@@ -447,123 +93,36 @@
         logger.info("Installation completed successfully! Please refer to the documentation for the next steps.")
         return config

-    def […]
-        """Detect […]
+    def upgrade_installed_transpilers(self) -> bool:
+        """Detect and upgrade, if possible and necessary, installed transpilers."""
         installed_transpilers = self._transpiler_repository.all_transpiler_names()
         if installed_transpilers:
             logger.info(f"Detected installed transpilers: {sorted(installed_transpilers)}")
-        […14 removed lines lost in this rendering…]
-        product_name = "databricks-morph-plugin"
-        group_id = "com.databricks.labs"
-        artifact_id = product_name
-        maven_installer = MavenInstaller(self._transpiler_repository, product_name, group_id, artifact_id, artifact)
-        maven_installer.install()
-
-    @classmethod
-    def is_java_version_okay(cls) -> bool:
-        detected_java = cls.find_java()
-        match detected_java:
-            case None:
-                logger.warning("No Java executable found in the system PATH.")
-                return False
-            case (java_executable, None):
-                logger.warning(f"Java found, but could not determine the version: {java_executable}.")
-                return False
-            case (java_executable, bytes(raw_version)):
-                logger.warning(f"Java found ({java_executable}), but could not parse the version:\n{raw_version}")
-                return False
-            case (java_executable, tuple(old_version)) if old_version < (11, 0, 0, 0):
-                version_str = ".".join(str(v) for v in old_version)
-                logger.warning(f"Java found ({java_executable}), but version {version_str} is too old.")
-                return False
-            case _:
-                return True
-
-    def install_artifact(self, artifact: str):
+        upgraded = False
+        for transpiler_installer in self._transpiler_installers:
+            name = transpiler_installer.name
+            if name in installed_transpilers:
+                logger.info(f"Checking for {name} upgrades...")
+                upgraded |= transpiler_installer.install()
+        # If we upgraded anything, the configuration process needs to run again.
+        if upgraded:
+            config = self.configure("transpile")
+            if not self._is_testing():
+                self._ws_installation.install(config)
+        return upgraded
+
+    def _install_artifact(self, artifact: str) -> None:
         path = Path(artifact)
         if not path.exists():
             logger.error(f"Could not locate artifact {artifact}")
             return
-        […4 removed lines lost in this rendering…]
+        for transpiler_installer in self._transpiler_installers:
+            if transpiler_installer.can_install(path):
+                transpiler_installer.install(path)
+                break
         else:
             logger.fatal(f"Cannot install unsupported artifact: {artifact}")

-    @classmethod
-    def find_java(cls) -> tuple[Path, tuple[int, int, int, int] | bytes | None] | None:
-        """Locate Java and return its version, as reported by `java -version`.
-
-        The java executable is currently located by searching the system PATH. Its version is parsed from the output of
-        the `java -version` command, which has been standardized since Java 10.
-
-        Returns:
-            a tuple of its path and the version as a tuple of integers (feature, interim, update, patch), if the java
-            executable could be located. If the version cannot be parsed, instead the raw version information is
-            returned, or `None` as a last resort. When no java executable is found, `None` is returned instead of a
-            tuple.
-        """
-        # Platform-independent way to reliably locate the java executable.
-        # Reference: https://docs.python.org/3.10/library/subprocess.html#popen-constructor
-        java_executable = shutil.which("java")
-        if java_executable is None:
-            return None
-        java_executable_path = Path(java_executable)
-        logger.debug(f"Using java executable: {java_executable_path!r}")
-        try:
-            completed = run([str(java_executable_path), "-version"], shell=False, capture_output=True, check=True)
-        except CalledProcessError as e:
-            logger.debug(
-                f"Failed to run {e.args!r} (exit-code={e.returncode}, stdout={e.stdout!r}, stderr={e.stderr!r})",
-                exc_info=e,
-            )
-            return java_executable_path, None
-        # It might not be ascii, but the bits we care about are so this will never fail.
-        raw_output = completed.stderr
-        java_version_output = raw_output.decode("ascii", errors="ignore")
-        java_version = cls._parse_java_version(java_version_output)
-        if java_version is None:
-            return java_executable_path, raw_output.strip()
-        logger.debug(f"Detected java version: {java_version}")
-        return java_executable_path, java_version
-
-    # Pattern to match a Java version string, compiled at import time to ensure it's valid.
-    # Ref: https://docs.oracle.com/en/java/javase/11/install/version-string-format.html
-    _java_version_pattern = re.compile(
-        r' version "(?P<feature>\d+)(?:\.(?P<interim>\d+)(?:\.(?P<update>\d+)(?:\.(?P<patch>\d+))?)?)?"'
-    )
-
-    @classmethod
-    def _parse_java_version(cls, version: str) -> tuple[int, int, int, int] | None:
-        """Locate and parse the Java version in the output of `java -version`."""
-        # Output looks like this:
-        # openjdk version "24.0.1" 2025-04-15
-        # OpenJDK Runtime Environment Temurin-24.0.1+9 (build 24.0.1+9)
-        # OpenJDK 64-Bit Server VM Temurin-24.0.1+9 (build 24.0.1+9, mixed mode)
-        match = cls._java_version_pattern.search(version)
-        if not match:
-            logger.debug(f"Could not parse java version: {version!r}")
-            return None
-        feature = int(match["feature"])
-        interim = int(match["interim"] or 0)
-        update = int(match["update"] or 0)
-        patch = int(match["patch"] or 0)
-        return feature, interim, update, patch
-
     def configure(self, module: str) -> LakebridgeConfiguration:
         match module:
             case "transpile":
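Note: the new `_install_artifact` relies on Python's `for`/`else`: the `else` clause attaches to the loop, not an `if`, and runs only when the loop finishes without `break`, i.e. when no installer's `can_install(path)` accepted the artifact. A standalone illustration:

```python
def pick(candidates: list[str], wanted: str) -> None:
    for candidate in candidates:
        if candidate == wanted:
            print(f"installing with {candidate}")
            break
    else:
        # Runs only if the loop completed without hitting break.
        print("cannot install unsupported artifact")

pick(["bladebridge", "morpheus"], "morpheus")  # installing with morpheus
pick(["bladebridge", "morpheus"], "unknown")   # cannot install unsupported artifact
```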
databricks/labs/lakebridge/reconcile/connectors/data_source.py

@@ -29,6 +29,7 @@ class DataSource(ABC):
         catalog: str | None,
         schema: str,
         table: str,
+        normalize: bool = True,
     ) -> list[Schema]:
         return NotImplemented

@@ -42,16 +43,19 @@ class DataSource(ABC):
         logger.warning(error_msg)
         raise DataSourceRuntimeException(error_msg) from exception

-    def _map_meta_column(self, meta_column) -> Schema:
+    def _map_meta_column(self, meta_column, normalize: bool) -> Schema:
         """Create a normalized Schema DTO from the database metadata

         Used in the implementations of get_schema to build a Schema DTO from the `INFORMATION_SCHEMA` query result.
         The returned Schema is normalized in case the database is having columns with special characters and standardize
         """
-        name = meta_column.col_name
+        name = meta_column.col_name.lower()
         dtype = meta_column.data_type.strip().lower()
-        […2 removed lines lost in this rendering…]
+        if normalize:
+            normalized = self.normalize_identifier(name)
+            return Schema(normalized.ansi_normalized, dtype, normalized.ansi_normalized, normalized.source_normalized)
+
+        return Schema(name, dtype, name, name)


 class MockDataSource(DataSource):
@@ -80,7 +84,7 @@ class MockDataSource(DataSource):
             return self.log_and_throw_exception(self._exception, "data", f"({catalog}, {schema}, {query})")
         return mock_df

-    def get_schema(self, catalog: str | None, schema: str, table: str) -> list[Schema]:
+    def get_schema(self, catalog: str | None, schema: str, table: str, normalize: bool = True) -> list[Schema]:
         catalog_str = catalog if catalog else ""
         mock_schema = self._schema_repository.get((catalog_str, schema, table))
         if not mock_schema:
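Note: the effect of the new `normalize` flag can be sketched in isolation. The `Schema` field order and the `ansi_normalized`/`source_normalized` names are taken from the diff above; the toy `normalize_identifier` below is an assumption standing in for the connector-specific implementations:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class NormalizedIdentifier:
    ansi_normalized: str
    source_normalized: str

def normalize_identifier(name: str) -> NormalizedIdentifier:
    # Toy rule for illustration: ANSI form backtick-quoted, source form double-quoted.
    bare = name.strip('`"')
    return NormalizedIdentifier(f"`{bare}`", f'"{bare}"')

def map_meta_column(col_name: str, data_type: str, normalize: bool) -> tuple[str, str, str, str]:
    # Mirrors DataSource._map_meta_column: lower-case first, then optionally normalize.
    name = col_name.lower()
    dtype = data_type.strip().lower()
    if normalize:
        n = normalize_identifier(name)
        return (n.ansi_normalized, dtype, n.ansi_normalized, n.source_normalized)
    return (name, dtype, name, name)

print(map_meta_column("Order ID", "VARCHAR", normalize=True))   # quoted forms
print(map_meta_column("Order ID", "VARCHAR", normalize=False))  # pass-through
```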
databricks/labs/lakebridge/reconcile/connectors/databricks.py

@@ -77,6 +77,7 @@ class DatabricksDataSource(DataSource, SecretsMixin):
         catalog: str | None,
         schema: str,
         table: str,
+        normalize: bool = True,
     ) -> list[Schema]:
         catalog_str = catalog if catalog else "hive_metastore"
         schema_query = _get_schema_query(catalog_str, schema, table)
@@ -85,7 +86,7 @@ class DatabricksDataSource(DataSource, SecretsMixin):
             logger.info(f"Fetching Schema: Started at: {datetime.now()}")
             schema_metadata = self._spark.sql(schema_query).where("col_name not like '#%'").distinct().collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-            return [self._map_meta_column(field) for field in schema_metadata]
+            return [self._map_meta_column(field, normalize) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)

databricks/labs/lakebridge/reconcile/connectors/oracle.py

@@ -81,6 +81,7 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
         catalog: str | None,
         schema: str,
         table: str,
+        normalize: bool = True,
     ) -> list[Schema]:
         schema_query = re.sub(
             r'\s+',
@@ -94,7 +95,7 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
             schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
             logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
             logger.debug(f"schema_metadata: ${schema_metadata}")
-            return [self._map_meta_column(field) for field in schema_metadata]
+            return [self._map_meta_column(field, normalize) for field in schema_metadata]
         except (RuntimeError, PySparkException) as e:
             return self.log_and_throw_exception(e, "schema", schema_query)
