databricks-labs-lakebridge 0.10.7__py3-none-any.whl → 0.10.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
  3. databricks/labs/lakebridge/base_install.py +1 -5
  4. databricks/labs/lakebridge/cli.py +13 -6
  5. databricks/labs/lakebridge/helpers/validation.py +5 -3
  6. databricks/labs/lakebridge/install.py +40 -481
  7. databricks/labs/lakebridge/reconcile/connectors/data_source.py +9 -5
  8. databricks/labs/lakebridge/reconcile/connectors/databricks.py +2 -1
  9. databricks/labs/lakebridge/reconcile/connectors/oracle.py +2 -1
  10. databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
  11. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +50 -29
  12. databricks/labs/lakebridge/reconcile/connectors/tsql.py +2 -1
  13. databricks/labs/lakebridge/reconcile/query_builder/base.py +50 -11
  14. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
  15. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
  16. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
  17. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
  18. databricks/labs/lakebridge/reconcile/recon_config.py +0 -15
  19. databricks/labs/lakebridge/reconcile/reconciliation.py +4 -1
  20. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +11 -31
  21. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +4 -1
  22. databricks/labs/lakebridge/transpiler/execute.py +34 -28
  23. databricks/labs/lakebridge/transpiler/installers.py +523 -0
  24. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +2 -0
  25. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
  26. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +30 -28
  27. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
  28. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
  29. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
  30. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/install.py
@@ -1,22 +1,10 @@
- import re
- import abc
  import dataclasses
  import logging
  import os
- import shutil
- import sys
- import venv
  import webbrowser
- import xml.etree.ElementTree as ET
- from datetime import datetime, timezone
- from json import loads, dump
+ from collections.abc import Set, Callable, Sequence
  from pathlib import Path
- from shutil import rmtree, move
- from subprocess import run, CalledProcessError
- from typing import Any, Literal, cast
- from urllib import request
- from urllib.error import URLError, HTTPError
- from zipfile import ZipFile
+ from typing import Any, cast

  from databricks.labs.blueprint.installation import Installation, JsonValue, SerdeError
  from databricks.labs.blueprint.installer import InstallState
@@ -37,6 +25,11 @@ from databricks.labs.lakebridge.contexts.application import ApplicationContext
  from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator
  from databricks.labs.lakebridge.deployment.installation import WorkspaceInstallation
  from databricks.labs.lakebridge.reconcile.constants import ReconReportType, ReconSourceType
+ from databricks.labs.lakebridge.transpiler.installers import (
+     BladebridgeInstaller,
+     MorpheusInstaller,
+     TranspilerInstaller,
+ )
  from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository

  logger = logging.getLogger(__name__)
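The installer machinery imported here is new in this release: per the files-changed list, the classes removed from install.py below now live in databricks/labs/lakebridge/transpiler/installers.py (+523 lines). A minimal sketch of the new import surface, assuming only what this diff shows (each installer class is callable with a TranspilerRepository):

    from databricks.labs.lakebridge.transpiler.installers import (
        BladebridgeInstaller,
        MorpheusInstaller,
    )
    from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository

    # Each installer is constructed with the repository it installs into.
    repository = TranspilerRepository.user_home()
    installers = [BladebridgeInstaller(repository), MorpheusInstaller(repository)]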
@@ -44,363 +37,6 @@ logger = logging.getLogger(__name__)
  TRANSPILER_WAREHOUSE_PREFIX = "Lakebridge Transpiler Validation"


- class _PathBackup:
-     """A context manager to preserve a path before performing an operation, and optionally restore it afterwards."""
-
-     def __init__(self, path: Path) -> None:
-         self._path = path
-         self._backup_path: Path | None = None
-         self._finished = False
-
-     def __enter__(self) -> "_PathBackup":
-         self.start()
-         return self
-
-     def start(self) -> None:
-         """Start the backup process by creating a backup of the path, if it already exists."""
-         backup_path = self._path.with_name(f"{self._path.name}-saved")
-         if backup_path.exists():
-             logger.debug(f"Existing backup found, removing: {backup_path}")
-             rmtree(backup_path)
-         if self._path.exists():
-             logger.debug(f"Backing up existing path: {self._path} -> {backup_path}")
-             os.rename(self._path, backup_path)
-             self._backup_path = backup_path
-         else:
-             self._backup_path = None
-
-     def rollback(self) -> None:
-         """Rollback the operation by restoring the backup path, if it exists."""
-         assert not self._finished, "Can only rollback/commit once."
-         logger.debug(f"Removing path: {self._path}")
-         rmtree(self._path)
-         if self._backup_path is not None:
-             logger.debug(f"Restoring previous path: {self._backup_path} -> {self._path}")
-             os.rename(self._backup_path, self._path)
-             self._backup_path = None
-         self._finished = True
-
-     def commit(self) -> None:
-         """Commit the operation by removing the backup path, if it exists."""
-         assert not self._finished, "Can only rollback/commit once."
-         if self._backup_path is not None:
-             logger.debug(f"Removing backup path: {self._backup_path}")
-             rmtree(self._backup_path)
-             self._backup_path = None
-         self._finished = True
-
-     def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]:
-         if not self._finished:
-             # Automatically commit or rollback based on whether an exception is underway.
-             if exc_val is None:
-                 self.commit()
-             else:
-                 self.rollback()
-         return False  # Do not suppress any exception underway
-
-
- class TranspilerInstaller(abc.ABC):
-
-     # TODO: Remove these properties when post-install is removed.
-     _install_path: Path
-     """The path where the transpiler is being installed, once this starts."""
-
-     def __init__(self, repository: TranspilerRepository, product_name: str) -> None:
-         self._repository = repository
-         self._product_name = product_name
-
-     _version_pattern = re.compile(r"[_-](\d+(?:[.\-_]\w*\d+)+)")
-
-     @classmethod
-     def get_local_artifact_version(cls, artifact: Path) -> str | None:
-         # TODO: Get the version from the metadata inside the artifact rather than relying on the filename.
-         match = cls._version_pattern.search(artifact.stem)
-         if not match:
-             return None
-         group = match.group(0)
-         if not group:
-             return None
-         # TODO: Update the regex to take care of these trimming scenarios.
-         if group.startswith('-'):
-             group = group[1:]
-         if group.endswith("-py3"):
-             group = group[:-4]
-         return group
-
-     @classmethod
-     def _store_product_state(cls, product_path: Path, version: str) -> None:
-         state_path = product_path / "state"
-         state_path.mkdir()
-         version_data = {"version": f"v{version}", "date": datetime.now(timezone.utc).isoformat()}
-         version_path = state_path / "version.json"
-         with version_path.open("w", encoding="utf-8") as f:
-             dump(version_data, f)
-             f.write("\n")
-
-     def _install_version_with_backup(self, version: str) -> Path | None:
-         """Install a specific version of the transpiler, with backup handling."""
-         logger.info(f"Installing Databricks {self._product_name} transpiler (v{version})")
-         product_path = self._repository.transpilers_path() / self._product_name
-         with _PathBackup(product_path) as backup:
-             self._install_path = product_path / "lib"
-             self._install_path.mkdir(parents=True, exist_ok=True)
-             try:
-                 result = self._install_version(version)
-             except (CalledProcessError, KeyError, ValueError) as e:
-                 # Warning: if you end up here under the IntelliJ/PyCharm debugger, it can be because the debugger is
-                 # trying to inject itself into the subprocess. Try disabling:
-                 #   Settings | Build, Execution, Deployment | Python Debugger | Attach to subprocess automatically while debugging
-                 # Note: Subprocess output is not captured, and should already be visible in the console.
-                 logger.error(f"Failed to install {self._product_name} transpiler (v{version})", exc_info=e)
-                 result = False
-
-             if result:
-                 logger.info(f"Successfully installed {self._product_name} transpiler (v{version})")
-                 self._store_product_state(product_path=product_path, version=version)
-                 backup.commit()
-                 return product_path
-             backup.rollback()
-             return None
-
-     @abc.abstractmethod
-     def _install_version(self, version: str) -> bool:
-         """Install a specific version of the transpiler, returning True if successful."""
-
-
- class WheelInstaller(TranspilerInstaller):
-
-     _venv_exec_cmd: Path
-     """Once created, the command to run the virtual environment's Python executable."""
-
-     _site_packages: Path
-     """Once created, the path to the site-packages directory in the virtual environment."""
-
-     @classmethod
-     def get_latest_artifact_version_from_pypi(cls, product_name: str) -> str | None:
-         try:
-             with request.urlopen(f"https://pypi.org/pypi/{product_name}/json") as server:
-                 text: bytes = server.read()
-             data: dict[str, Any] = loads(text)
-             return data.get("info", {}).get('version', None)
-         except HTTPError as e:
-             logger.error(f"Error while fetching PyPI metadata: {product_name}", exc_info=e)
-             return None
-
-     def __init__(
-         self,
-         repository: TranspilerRepository,
-         product_name: str,
-         pypi_name: str,
-         artifact: Path | None = None,
-     ) -> None:
-         super().__init__(repository, product_name)
-         self._pypi_name = pypi_name
-         self._artifact = artifact
-
-     def install(self) -> Path | None:
-         return self._install_checking_versions()
-
-     def _install_checking_versions(self) -> Path | None:
-         latest_version = (
-             self.get_local_artifact_version(self._artifact)
-             if self._artifact
-             else self.get_latest_artifact_version_from_pypi(self._pypi_name)
-         )
-         if latest_version is None:
-             logger.warning(f"Could not determine the latest version of {self._pypi_name}")
-             logger.error(f"Failed to install transpiler: {self._product_name}")
-             return None
-         installed_version = self._repository.get_installed_version(self._product_name)
-         if installed_version == latest_version:
-             logger.info(f"{self._pypi_name} v{latest_version} already installed")
-             return None
-         return self._install_version_with_backup(latest_version)
-
-     def _install_version(self, version: str) -> bool:
-         self._create_venv()
-         self._install_with_pip()
-         self._copy_lsp_resources()
-         return self._post_install() is not None
-
-     def _create_venv(self) -> None:
-         venv_path = self._install_path / ".venv"
-         # Sadly, some platform-specific variations need to be dealt with:
-         #   - Windows venvs do not use symlinks, but rather copies, when populating the venv.
-         #   - The library path is different.
-         if use_symlinks := sys.platform != "win32":
-             major, minor = sys.version_info[:2]
-             lib_path = venv_path / "lib" / f"python{major}.{minor}" / "site-packages"
-         else:
-             lib_path = venv_path / "Lib" / "site-packages"
-         builder = venv.EnvBuilder(with_pip=True, prompt=f"{self._product_name}", symlinks=use_symlinks)
-         builder.create(venv_path)
-         context = builder.ensure_directories(venv_path)
-         logger.debug(f"Created virtual environment with context: {context}")
-         self._venv_exec_cmd = context.env_exec_cmd
-         self._site_packages = lib_path
-
-     def _install_with_pip(self) -> None:
-         # Based on: https://pip.pypa.io/en/stable/user_guide/#using-pip-from-your-program
-         # (But with venv_exec_cmd instead of sys.executable, so that we use the venv's pip.)
-         to_install: Path | str = self._artifact if self._artifact is not None else self._pypi_name
-         command: list[Path | str] = [
-             self._venv_exec_cmd,
-             "-m",
-             "pip",
-             "--disable-pip-version-check",
-             "install",
-             to_install,
-         ]
-         result = run(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, check=False)
-         result.check_returncode()
-
-     def _copy_lsp_resources(self):
-         lsp = self._site_packages / "lsp"
-         if not lsp.exists():
-             raise ValueError("Installed transpiler is missing a 'lsp' folder")
-         shutil.copytree(lsp, self._install_path, dirs_exist_ok=True)
-
-     def _post_install(self) -> Path | None:
-         config = self._install_path / "config.yml"
-         if not config.exists():
-             raise ValueError("Installed transpiler is missing a 'config.yml' file in its 'lsp' folder")
-         install_ext = "ps1" if sys.platform == "win32" else "sh"
-         install_script = f"installer.{install_ext}"
-         installer_path = self._install_path / install_script
-         if installer_path.exists():
-             self._run_custom_installer(installer_path)
-         return self._install_path
-
-     def _run_custom_installer(self, installer_path: Path) -> None:
-         args = [installer_path]
-         run(args, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, cwd=self._install_path, check=True)
-
-
- class MavenInstaller(TranspilerInstaller):
-     # Maven Central, base URL.
-     _maven_central_repo: str = "https://repo.maven.apache.org/maven2/"
-
-     @classmethod
-     def _artifact_base_url(cls, group_id: str, artifact_id: str) -> str:
-         """Construct the base URL for a Maven artifact."""
-         # Reference: https://maven.apache.org/repositories/layout.html
-         group_path = group_id.replace(".", "/")
-         return f"{cls._maven_central_repo}{group_path}/{artifact_id}/"
-
-     @classmethod
-     def artifact_metadata_url(cls, group_id: str, artifact_id: str) -> str:
-         """Get the metadata URL for a Maven artifact."""
-         # TODO: Unit test this method.
-         return f"{cls._artifact_base_url(group_id, artifact_id)}maven-metadata.xml"
-
-     @classmethod
-     def artifact_url(
-         cls, group_id: str, artifact_id: str, version: str, classifier: str | None = None, extension: str = "jar"
-     ) -> str:
-         """Get the URL for a versioned Maven artifact."""
-         # TODO: Unit test this method, including classifier and extension.
-         _classifier = f"-{classifier}" if classifier else ""
-         artifact_base_url = cls._artifact_base_url(group_id, artifact_id)
-         return f"{artifact_base_url}{version}/{artifact_id}-{version}{_classifier}.{extension}"
-
-     @classmethod
-     def get_current_maven_artifact_version(cls, group_id: str, artifact_id: str) -> str | None:
-         url = cls.artifact_metadata_url(group_id, artifact_id)
-         try:
-             with request.urlopen(url) as server:
-                 text = server.read()
-         except HTTPError as e:
-             logger.error(f"Error while fetching maven metadata: {group_id}:{artifact_id}", exc_info=e)
-             return None
-         logger.debug(f"Maven metadata for {group_id}:{artifact_id}: {text}")
-         return cls._extract_latest_release_version(text)
-
-     @classmethod
-     def _extract_latest_release_version(cls, maven_metadata: str) -> str | None:
-         """Extract the latest release version from Maven metadata."""
-         # Reference: https://maven.apache.org/repositories/metadata.html#The_A_Level_Metadata
-         # TODO: Unit test this method, to verify the sequence of things it checks for.
-         root = ET.fromstring(maven_metadata)
-         for label in ("release", "latest"):
-             version = root.findtext(f"./versioning/{label}")
-             if version is not None:
-                 return version
-         return root.findtext("./versioning/versions/version[last()]")
-
-     @classmethod
-     def download_artifact_from_maven(
-         cls,
-         group_id: str,
-         artifact_id: str,
-         version: str,
-         target: Path,
-         classifier: str | None = None,
-         extension: str = "jar",
-     ) -> bool:
-         if target.exists():
-             logger.warning(f"Skipping download of {group_id}:{artifact_id}:{version}; target already exists: {target}")
-             return True
-         url = cls.artifact_url(group_id, artifact_id, version, classifier, extension)
-         try:
-             path, _ = request.urlretrieve(url)
-             logger.debug(f"Downloaded maven artefact from {url} to {path}")
-         except URLError as e:
-             logger.error(f"Unable to download maven artefact: {group_id}:{artifact_id}:{version}", exc_info=e)
-             return False
-         logger.debug(f"Moving {path} to {target}")
-         move(path, target)
-         logger.info(f"Successfully installed: {group_id}:{artifact_id}:{version}")
-         return True
-
-     def __init__(
-         self,
-         repository: TranspilerRepository,
-         product_name: str,
-         group_id: str,
-         artifact_id: str,
-         artifact: Path | None = None,
-     ) -> None:
-         super().__init__(repository, product_name)
-         self._group_id = group_id
-         self._artifact_id = artifact_id
-         self._artifact = artifact
-
-     def install(self) -> Path | None:
-         return self._install_checking_versions()
-
-     def _install_checking_versions(self) -> Path | None:
-         if self._artifact:
-             latest_version = self.get_local_artifact_version(self._artifact)
-         else:
-             latest_version = self.get_current_maven_artifact_version(self._group_id, self._artifact_id)
-         if latest_version is None:
-             logger.warning(f"Could not determine the latest version of Databricks {self._product_name} transpiler")
-             logger.error("Failed to install transpiler: Databricks {self._product_name} transpiler")
-             return None
-         installed_version = self._repository.get_installed_version(self._product_name)
-         if installed_version == latest_version:
-             logger.info(f"Databricks {self._product_name} transpiler v{latest_version} already installed")
-             return None
-         return self._install_version_with_backup(latest_version)
-
-     def _install_version(self, version: str) -> bool:
-         jar_file_path = self._install_path / f"{self._artifact_id}.jar"
-         if self._artifact:
-             logger.debug(f"Copying: {self._artifact} -> {jar_file_path}")
-             shutil.copyfile(self._artifact, jar_file_path)
-         elif not self.download_artifact_from_maven(self._group_id, self._artifact_id, version, jar_file_path):
-             logger.error(f"Failed to install Databricks {self._product_name} transpiler (v{version})")
-             return False
-         self._copy_lsp_config(jar_file_path)
-         return True
-
-     def _copy_lsp_config(self, jar_file_path: Path) -> None:
-         with ZipFile(jar_file_path) as zip_file:
-             zip_file.extract("lsp/config.yml", self._install_path)
-         shutil.move(self._install_path / "lsp" / "config.yml", self._install_path / "config.yml")
-         os.rmdir(self._install_path / "lsp")
-
-
  class WorkspaceInstaller:
      def __init__(
          self,
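The removal above is a relocation, not a deletion: these classes move into the new transpiler/installers.py module listed in the files-changed summary. As a reminder of the `_PathBackup` contract (commit on clean exit, rollback when an exception escapes), here is a hedged sketch; the import location is assumed, since the new module's contents are not shown in this diff:

    from pathlib import Path
    from databricks.labs.lakebridge.transpiler.installers import _PathBackup  # assumed location

    target = Path("/tmp/example-transpiler")  # hypothetical install path
    try:
        with _PathBackup(target):
            target.mkdir(parents=True, exist_ok=True)
            raise RuntimeError("simulated install failure")
    except RuntimeError:
        # __exit__ saw the exception and called rollback(), so any previous
        # contents of `target` were restored from the "-saved" copy.
        pass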
@@ -412,7 +48,12 @@ class WorkspaceInstaller:
          resource_configurator: ResourceConfigurator,
          workspace_installation: WorkspaceInstallation,
          environ: dict[str, str] | None = None,
+         *,
          transpiler_repository: TranspilerRepository = TranspilerRepository.user_home(),
+         transpiler_installers: Sequence[Callable[[TranspilerRepository], TranspilerInstaller]] = (
+             BladebridgeInstaller,
+             MorpheusInstaller,
+         ),
      ):
          self._ws = ws
          self._prompts = prompts
@@ -422,6 +63,7 @@ class WorkspaceInstaller:
          self._resource_configurator = resource_configurator
          self._ws_installation = workspace_installation
          self._transpiler_repository = transpiler_repository
+         self._transpiler_installer_factories = transpiler_installers

          if not environ:
              environ = dict(os.environ.items())
@@ -430,15 +72,19 @@
              msg = "WorkspaceInstaller is not supposed to be executed in Databricks Runtime"
              raise SystemExit(msg)

+     @property
+     def _transpiler_installers(self) -> Set[TranspilerInstaller]:
+         return frozenset(factory(self._transpiler_repository) for factory in self._transpiler_installer_factories)
+
      def run(
          self, module: str, config: LakebridgeConfiguration | None = None, artifact: str | None = None
      ) -> LakebridgeConfiguration:
          logger.debug(f"Initializing workspace installation for module: {module} (config: {config})")
          if module == "transpile" and artifact:
-             self.install_artifact(artifact)
+             self._install_artifact(artifact)
          elif module in {"transpile", "all"}:
-             self.install_bladebridge()
-             self.install_morpheus()
+             for transpiler_installer in self._transpiler_installers:
+                 transpiler_installer.install()
          if not config:
              config = self.configure(module)
          if self._is_testing():
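The new `_transpiler_installers` property is the whole dependency-injection story: `transpiler_installers` holds factories, and each is called with the configured repository to produce an installer. A minimal sketch of that contract, using only names shown in this diff (any callable with the same shape, such as a test stub, qualifies):

    from collections.abc import Callable, Sequence

    from databricks.labs.lakebridge.transpiler.installers import (
        BladebridgeInstaller,
        MorpheusInstaller,
        TranspilerInstaller,
    )
    from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository

    factories: Sequence[Callable[[TranspilerRepository], TranspilerInstaller]] = (
        BladebridgeInstaller,
        MorpheusInstaller,
    )
    repository = TranspilerRepository.user_home()
    # Mirrors the property above: materialize one installer per factory.
    installers = frozenset(factory(repository) for factory in factories)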
@@ -447,123 +93,36 @@
          logger.info("Installation completed successfully! Please refer to the documentation for the next steps.")
          return config

-     def has_installed_transpilers(self) -> bool:
-         """Detect whether there are transpilers currently installed."""
+     def upgrade_installed_transpilers(self) -> bool:
+         """Detect and upgrade, if possible and necessary, installed transpilers."""
          installed_transpilers = self._transpiler_repository.all_transpiler_names()
          if installed_transpilers:
              logger.info(f"Detected installed transpilers: {sorted(installed_transpilers)}")
-         return bool(installed_transpilers)
-
-     def install_bladebridge(self, artifact: Path | None = None) -> None:
-         local_name = "bladebridge"
-         pypi_name = "databricks-bb-plugin"
-         wheel_installer = WheelInstaller(self._transpiler_repository, local_name, pypi_name, artifact)
-         wheel_installer.install()
-
-     def install_morpheus(self, artifact: Path | None = None) -> None:
-         if not self.is_java_version_okay():
-             logger.error(
-                 "The morpheus transpiler requires Java 11 or above. Please install Java and re-run 'install-transpile'."
-             )
-             return
-         product_name = "databricks-morph-plugin"
-         group_id = "com.databricks.labs"
-         artifact_id = product_name
-         maven_installer = MavenInstaller(self._transpiler_repository, product_name, group_id, artifact_id, artifact)
-         maven_installer.install()
-
-     @classmethod
-     def is_java_version_okay(cls) -> bool:
-         detected_java = cls.find_java()
-         match detected_java:
-             case None:
-                 logger.warning("No Java executable found in the system PATH.")
-                 return False
-             case (java_executable, None):
-                 logger.warning(f"Java found, but could not determine the version: {java_executable}.")
-                 return False
-             case (java_executable, bytes(raw_version)):
-                 logger.warning(f"Java found ({java_executable}), but could not parse the version:\n{raw_version}")
-                 return False
-             case (java_executable, tuple(old_version)) if old_version < (11, 0, 0, 0):
-                 version_str = ".".join(str(v) for v in old_version)
-                 logger.warning(f"Java found ({java_executable}), but version {version_str} is too old.")
-                 return False
-             case _:
-                 return True
-
-     def install_artifact(self, artifact: str):
+         upgraded = False
+         for transpiler_installer in self._transpiler_installers:
+             name = transpiler_installer.name
+             if name in installed_transpilers:
+                 logger.info(f"Checking for {name} upgrades...")
+                 upgraded |= transpiler_installer.install()
+         # If we upgraded anything, the configuration process needs to run again.
+         if upgraded:
+             config = self.configure("transpile")
+             if not self._is_testing():
+                 self._ws_installation.install(config)
+         return upgraded
+
+     def _install_artifact(self, artifact: str) -> None:
          path = Path(artifact)
          if not path.exists():
              logger.error(f"Could not locate artifact {artifact}")
              return
-         if "databricks-morph-plugin" in path.name:
-             self.install_morpheus(path)
-         elif "databricks_bb_plugin" in path.name:
-             self.install_bladebridge(path)
+         for transpiler_installer in self._transpiler_installers:
+             if transpiler_installer.can_install(path):
+                 transpiler_installer.install(path)
+                 break
          else:
              logger.fatal(f"Cannot install unsupported artifact: {artifact}")

-     @classmethod
-     def find_java(cls) -> tuple[Path, tuple[int, int, int, int] | bytes | None] | None:
-         """Locate Java and return its version, as reported by `java -version`.
-
-         The java executable is currently located by searching the system PATH. Its version is parsed from the output of
-         the `java -version` command, which has been standardized since Java 10.
-
-         Returns:
-             a tuple of its path and the version as a tuple of integers (feature, interim, update, patch), if the java
-             executable could be located. If the version cannot be parsed, instead the raw version information is
-             returned, or `None` as a last resort. When no java executable is found, `None` is returned instead of a
-             tuple.
-         """
-         # Platform-independent way to reliably locate the java executable.
-         # Reference: https://docs.python.org/3.10/library/subprocess.html#popen-constructor
-         java_executable = shutil.which("java")
-         if java_executable is None:
-             return None
-         java_executable_path = Path(java_executable)
-         logger.debug(f"Using java executable: {java_executable_path!r}")
-         try:
-             completed = run([str(java_executable_path), "-version"], shell=False, capture_output=True, check=True)
-         except CalledProcessError as e:
-             logger.debug(
-                 f"Failed to run {e.args!r} (exit-code={e.returncode}, stdout={e.stdout!r}, stderr={e.stderr!r})",
-                 exc_info=e,
-             )
-             return java_executable_path, None
-         # It might not be ascii, but the bits we care about are so this will never fail.
-         raw_output = completed.stderr
-         java_version_output = raw_output.decode("ascii", errors="ignore")
-         java_version = cls._parse_java_version(java_version_output)
-         if java_version is None:
-             return java_executable_path, raw_output.strip()
-         logger.debug(f"Detected java version: {java_version}")
-         return java_executable_path, java_version
-
-     # Pattern to match a Java version string, compiled at import time to ensure it's valid.
-     # Ref: https://docs.oracle.com/en/java/javase/11/install/version-string-format.html
-     _java_version_pattern = re.compile(
-         r' version "(?P<feature>\d+)(?:\.(?P<interim>\d+)(?:\.(?P<update>\d+)(?:\.(?P<patch>\d+))?)?)?"'
-     )
-
-     @classmethod
-     def _parse_java_version(cls, version: str) -> tuple[int, int, int, int] | None:
-         """Locate and parse the Java version in the output of `java -version`."""
-         # Output looks like this:
-         #   openjdk version "24.0.1" 2025-04-15
-         #   OpenJDK Runtime Environment Temurin-24.0.1+9 (build 24.0.1+9)
-         #   OpenJDK 64-Bit Server VM Temurin-24.0.1+9 (build 24.0.1+9, mixed mode)
-         match = cls._java_version_pattern.search(version)
-         if not match:
-             logger.debug(f"Could not parse java version: {version!r}")
-             return None
-         feature = int(match["feature"])
-         interim = int(match["interim"] or 0)
-         update = int(match["update"] or 0)
-         patch = int(match["patch"] or 0)
-         return feature, interim, update, patch
-
      def configure(self, module: str) -> LakebridgeConfiguration:
          match module:
              case "transpile":
databricks/labs/lakebridge/reconcile/connectors/data_source.py
@@ -29,6 +29,7 @@ class DataSource(ABC):
          catalog: str | None,
          schema: str,
          table: str,
+         normalize: bool = True,
      ) -> list[Schema]:
          return NotImplemented

@@ -42,16 +43,19 @@ class DataSource(ABC):
          logger.warning(error_msg)
          raise DataSourceRuntimeException(error_msg) from exception

-     def _map_meta_column(self, meta_column) -> Schema:
+     def _map_meta_column(self, meta_column, normalize: bool) -> Schema:
          """Create a normalized Schema DTO from the database metadata

          Used in the implementations of get_schema to build a Schema DTO from the `INFORMATION_SCHEMA` query result.
          The returned Schema is normalized in case the database is having columns with special characters and standardize
          """
-         name = meta_column.col_name
+         name = meta_column.col_name.lower()
          dtype = meta_column.data_type.strip().lower()
-         normalized = self.normalize_identifier(name)
-         return Schema(normalized.ansi_normalized, dtype, normalized.ansi_normalized, normalized.source_normalized)
+         if normalize:
+             normalized = self.normalize_identifier(name)
+             return Schema(normalized.ansi_normalized, dtype, normalized.ansi_normalized, normalized.source_normalized)
+
+         return Schema(name, dtype, name, name)


  class MockDataSource(DataSource):
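The new flag splits `_map_meta_column` into two paths: with normalize=True the lower-cased name still goes through normalize_identifier as before, while with normalize=False it is reused verbatim for every name field of the Schema DTO. A hedged illustration of the normalize=False path, assuming a metadata row with the two attributes the method reads:

    from types import SimpleNamespace

    from databricks.labs.lakebridge.reconcile.recon_config import Schema  # assumed module for the DTO

    meta_column = SimpleNamespace(col_name="Order ID", data_type=" STRING ")  # stand-in metadata row

    name = meta_column.col_name.lower()            # "order id"
    dtype = meta_column.data_type.strip().lower()  # "string"
    schema = Schema(name, dtype, name, name)       # same lower-cased name in every name slot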
@@ -80,7 +84,7 @@ class MockDataSource(DataSource):
              return self.log_and_throw_exception(self._exception, "data", f"({catalog}, {schema}, {query})")
          return mock_df

-     def get_schema(self, catalog: str | None, schema: str, table: str) -> list[Schema]:
+     def get_schema(self, catalog: str | None, schema: str, table: str, normalize: bool = True) -> list[Schema]:
          catalog_str = catalog if catalog else ""
          mock_schema = self._schema_repository.get((catalog_str, schema, table))
          if not mock_schema:
databricks/labs/lakebridge/reconcile/connectors/databricks.py
@@ -77,6 +77,7 @@ class DatabricksDataSource(DataSource, SecretsMixin):
          catalog: str | None,
          schema: str,
          table: str,
+         normalize: bool = True,
      ) -> list[Schema]:
          catalog_str = catalog if catalog else "hive_metastore"
          schema_query = _get_schema_query(catalog_str, schema, table)
@@ -85,7 +86,7 @@
              logger.info(f"Fetching Schema: Started at: {datetime.now()}")
              schema_metadata = self._spark.sql(schema_query).where("col_name not like '#%'").distinct().collect()
              logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
-             return [self._map_meta_column(field) for field in schema_metadata]
+             return [self._map_meta_column(field, normalize) for field in schema_metadata]
          except (RuntimeError, PySparkException) as e:
              return self.log_and_throw_exception(e, "schema", schema_query)

databricks/labs/lakebridge/reconcile/connectors/oracle.py
@@ -81,6 +81,7 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
          catalog: str | None,
          schema: str,
          table: str,
+         normalize: bool = True,
      ) -> list[Schema]:
          schema_query = re.sub(
              r'\s+',
@@ -94,7 +95,7 @@ class OracleDataSource(DataSource, SecretsMixin, JDBCReaderMixin):
              schema_metadata = df.select([col(c).alias(c.lower()) for c in df.columns]).collect()
              logger.info(f"Schema fetched successfully. Completed at: {datetime.now()}")
              logger.debug(f"schema_metadata: ${schema_metadata}")
-             return [self._map_meta_column(field) for field in schema_metadata]
+             return [self._map_meta_column(field, normalize) for field in schema_metadata]
          except (RuntimeError, PySparkException) as e:
              return self.log_and_throw_exception(e, "schema", schema_query)
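Each concrete connector simply threads the flag through to `_map_meta_column`, so callers choose per lookup whether identifiers are normalized. A hedged usage sketch (catalog, schema, and table names are hypothetical):

    # Default behaviour is unchanged: normalized ANSI/source identifiers.
    normalized = data_source.get_schema("main", "reconcile", "orders")

    # Opt out to get raw, lower-cased column names instead.
    raw = data_source.get_schema("main", "reconcile", "orders", normalize=False)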