databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
  5. databricks/labs/lakebridge/base_install.py +20 -3
  6. databricks/labs/lakebridge/cli.py +32 -59
  7. databricks/labs/lakebridge/contexts/application.py +7 -0
  8. databricks/labs/lakebridge/deployment/job.py +2 -2
  9. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  10. databricks/labs/lakebridge/helpers/validation.py +5 -3
  11. databricks/labs/lakebridge/install.py +73 -484
  12. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  13. databricks/labs/lakebridge/reconcile/connectors/data_source.py +24 -1
  14. databricks/labs/lakebridge/reconcile/connectors/databricks.py +12 -1
  15. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  16. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  17. databricks/labs/lakebridge/reconcile/connectors/oracle.py +12 -1
  18. databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
  19. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +63 -30
  20. databricks/labs/lakebridge/reconcile/connectors/tsql.py +28 -2
  21. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  22. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  23. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  24. databricks/labs/lakebridge/reconcile/query_builder/base.py +53 -18
  25. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
  26. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
  27. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
  28. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
  29. databricks/labs/lakebridge/reconcile/recon_config.py +3 -15
  30. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  31. databricks/labs/lakebridge/reconcile/reconciliation.py +511 -0
  32. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  33. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +78 -0
  34. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +256 -0
  35. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  36. databricks/labs/lakebridge/transpiler/execute.py +34 -28
  37. databricks/labs/lakebridge/transpiler/installers.py +523 -0
  38. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +47 -60
  39. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  40. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  41. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
  42. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +46 -35
  43. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
  44. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
  45. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
  46. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/transpiler/installers.py (new file)
@@ -0,0 +1,523 @@
+import abc
+import datetime as dt
+import logging
+import os
+import re
+import shutil
+import subprocess
+import sys
+import venv
+import xml.etree.ElementTree as ET
+from json import dump, loads
+from pathlib import Path
+from shutil import rmtree
+from typing import Any, Literal
+from urllib import request
+from urllib.error import HTTPError, URLError
+from zipfile import ZipFile
+
+from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository
+
+logger = logging.getLogger(__name__)
+
+
+class _PathBackup:
+    """A context manager to preserve a path before performing an operation, and optionally restore it afterwards."""
+
+    def __init__(self, path: Path) -> None:
+        self._path = path
+        self._backup_path: Path | None = None
+        self._finished = False
+
+    def __enter__(self) -> "_PathBackup":
+        self.start()
+        return self
+
+    def start(self) -> None:
+        """Start the backup process by creating a backup of the path, if it already exists."""
+        backup_path = self._path.with_name(f"{self._path.name}-saved")
+        if backup_path.exists():
+            logger.debug(f"Existing backup found, removing: {backup_path}")
+            rmtree(backup_path)
+        if self._path.exists():
+            logger.debug(f"Backing up existing path: {self._path} -> {backup_path}")
+            os.rename(self._path, backup_path)
+            self._backup_path = backup_path
+        else:
+            self._backup_path = None
+
+    def rollback(self) -> None:
+        """Rollback the operation by restoring the backup path, if it exists."""
+        assert not self._finished, "Can only rollback/commit once."
+        logger.debug(f"Removing path: {self._path}")
+        rmtree(self._path)
+        if self._backup_path is not None:
+            logger.debug(f"Restoring previous path: {self._backup_path} -> {self._path}")
+            os.rename(self._backup_path, self._path)
+            self._backup_path = None
+        self._finished = True
+
+    def commit(self) -> None:
+        """Commit the operation by removing the backup path, if it exists."""
+        assert not self._finished, "Can only rollback/commit once."
+        if self._backup_path is not None:
+            logger.debug(f"Removing backup path: {self._backup_path}")
+            rmtree(self._backup_path)
+            self._backup_path = None
+        self._finished = True
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]:
+        if not self._finished:
+            # Automatically commit or rollback based on whether an exception is underway.
+            if exc_val is None:
+                self.commit()
+            else:
+                self.rollback()
+        return False  # Do not suppress any exception underway
+
+
+class ArtifactInstaller(abc.ABC):
+
+    # TODO: Remove these properties when post-install is removed.
+    _install_path: Path
+    """The path where the transpiler is being installed, once this starts."""
+
+    def __init__(self, repository: TranspilerRepository, product_name: str) -> None:
+        self._repository = repository
+        self._product_name = product_name
+
+    _version_pattern = re.compile(r"[_-](\d+(?:[.\-_]\w*\d+)+)")
+
+    @classmethod
+    def get_local_artifact_version(cls, artifact: Path) -> str | None:
+        # TODO: Get the version from the metadata inside the artifact rather than relying on the filename.
+        match = cls._version_pattern.search(artifact.stem)
+        if not match:
+            return None
+        group = match.group(0)
+        if not group:
+            return None
+        # TODO: Update the regex to take care of these trimming scenarios.
+        if group.startswith('-'):
+            group = group[1:]
+        if group.endswith("-py3"):
+            group = group[:-4]
+        return group
+
+    @classmethod
+    def _store_product_state(cls, product_path: Path, version: str) -> None:
+        state_path = product_path / "state"
+        state_path.mkdir()
+        version_data = {"version": f"v{version}", "date": dt.datetime.now(dt.timezone.utc).isoformat()}
+        version_path = state_path / "version.json"
+        with version_path.open("w", encoding="utf-8") as f:
+            dump(version_data, f)
+            f.write("\n")
+
+    def _install_version_with_backup(self, version: str) -> Path | None:
+        """Install a specific version of the transpiler, with backup handling."""
+        logger.info(f"Installing Databricks {self._product_name} transpiler (v{version})")
+        product_path = self._repository.transpilers_path() / self._product_name
+        with _PathBackup(product_path) as backup:
+            self._install_path = product_path / "lib"
+            self._install_path.mkdir(parents=True, exist_ok=True)
+            try:
+                result = self._install_version(version)
+            except (subprocess.CalledProcessError, KeyError, ValueError) as e:
+                # Warning: if you end up here under the IntelliJ/PyCharm debugger, it can be because the debugger is
+                # trying to inject itself into the subprocess. Try disabling:
+                #   Settings | Build, Execution, Deployment | Python Debugger | Attach to subprocess automatically while debugging
+                # Note: Subprocess output is not captured, and should already be visible in the console.
+                logger.error(f"Failed to install {self._product_name} transpiler (v{version})", exc_info=e)
+                result = False
+
+            if result:
+                logger.info(f"Successfully installed {self._product_name} transpiler (v{version})")
+                self._store_product_state(product_path=product_path, version=version)
+                backup.commit()
+                return product_path
+            backup.rollback()
+            return None
+
+    @abc.abstractmethod
+    def _install_version(self, version: str) -> bool:
+        """Install a specific version of the transpiler, returning True if successful."""
+
+
+class WheelInstaller(ArtifactInstaller):
+
+    _venv_exec_cmd: Path
+    """Once created, the command to run the virtual environment's Python executable."""
+
+    _site_packages: Path
+    """Once created, the path to the site-packages directory in the virtual environment."""
+
+    @classmethod
+    def get_latest_artifact_version_from_pypi(cls, product_name: str) -> str | None:
+        try:
+            with request.urlopen(f"https://pypi.org/pypi/{product_name}/json") as server:
+                text: bytes = server.read()
+            data: dict[str, Any] = loads(text)
+            return data.get("info", {}).get('version', None)
+        except HTTPError as e:
+            logger.error(f"Error while fetching PyPI metadata: {product_name}", exc_info=e)
+            return None
+
+    def __init__(
+        self,
+        repository: TranspilerRepository,
+        product_name: str,
+        pypi_name: str,
+        artifact: Path | None = None,
+    ) -> None:
+        super().__init__(repository, product_name)
+        self._pypi_name = pypi_name
+        self._artifact = artifact
+
+    def install(self) -> Path | None:
+        return self._install_checking_versions()
+
+    def _install_checking_versions(self) -> Path | None:
+        latest_version = (
+            self.get_local_artifact_version(self._artifact)
+            if self._artifact
+            else self.get_latest_artifact_version_from_pypi(self._pypi_name)
+        )
+        if latest_version is None:
+            logger.warning(f"Could not determine the latest version of {self._pypi_name}")
+            logger.error(f"Failed to install transpiler: {self._product_name}")
+            return None
+        installed_version = self._repository.get_installed_version(self._product_name)
+        if installed_version == latest_version:
+            logger.info(f"{self._pypi_name} v{latest_version} already installed")
+            return None
+        return self._install_version_with_backup(latest_version)
+
+    def _install_version(self, version: str) -> bool:
+        self._create_venv()
+        self._install_with_pip()
+        self._copy_lsp_resources()
+        return self._post_install() is not None
+
+    def _create_venv(self) -> None:
+        venv_path = self._install_path / ".venv"
+        # Sadly, some platform-specific variations need to be dealt with:
+        #  - Windows venvs do not use symlinks, but rather copies, when populating the venv.
+        #  - The library path is different.
+        if use_symlinks := sys.platform != "win32":
+            major, minor = sys.version_info[:2]
+            lib_path = venv_path / "lib" / f"python{major}.{minor}" / "site-packages"
+        else:
+            lib_path = venv_path / "Lib" / "site-packages"
+        builder = venv.EnvBuilder(with_pip=True, prompt=f"{self._product_name}", symlinks=use_symlinks)
+        builder.create(venv_path)
+        context = builder.ensure_directories(venv_path)
+        logger.debug(f"Created virtual environment with context: {context}")
+        self._venv_exec_cmd = context.env_exec_cmd
+        self._site_packages = lib_path
+
+    def _install_with_pip(self) -> None:
+        # Based on: https://pip.pypa.io/en/stable/user_guide/#using-pip-from-your-program
+        # (But with venv_exec_cmd instead of sys.executable, so that we use the venv's pip.)
+        to_install: Path | str = self._artifact if self._artifact is not None else self._pypi_name
+        command: list[Path | str] = [
+            self._venv_exec_cmd,
+            "-m",
+            "pip",
+            "--disable-pip-version-check",
+            "install",
+            to_install,
+        ]
+        result = subprocess.run(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, check=False)
+        result.check_returncode()
+
+    def _copy_lsp_resources(self):
+        lsp = self._site_packages / "lsp"
+        if not lsp.exists():
+            raise ValueError("Installed transpiler is missing a 'lsp' folder")
+        shutil.copytree(lsp, self._install_path, dirs_exist_ok=True)
+
+    def _post_install(self) -> Path | None:
+        config = self._install_path / "config.yml"
+        if not config.exists():
+            raise ValueError("Installed transpiler is missing a 'config.yml' file in its 'lsp' folder")
+        install_ext = "ps1" if sys.platform == "win32" else "sh"
+        install_script = f"installer.{install_ext}"
+        installer_path = self._install_path / install_script
+        if installer_path.exists():
+            self._run_custom_installer(installer_path)
+        return self._install_path
+
+    def _run_custom_installer(self, installer_path: Path) -> None:
+        args = [installer_path]
+        subprocess.run(args, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, cwd=self._install_path, check=True)
+
+
+class MavenInstaller(ArtifactInstaller):
+    # Maven Central, base URL.
+    _maven_central_repo: str = "https://repo.maven.apache.org/maven2/"
+
+    @classmethod
+    def _artifact_base_url(cls, group_id: str, artifact_id: str) -> str:
+        """Construct the base URL for a Maven artifact."""
+        # Reference: https://maven.apache.org/repositories/layout.html
+        group_path = group_id.replace(".", "/")
+        return f"{cls._maven_central_repo}{group_path}/{artifact_id}/"
+
+    @classmethod
+    def artifact_metadata_url(cls, group_id: str, artifact_id: str) -> str:
+        """Get the metadata URL for a Maven artifact."""
+        # TODO: Unit test this method.
+        return f"{cls._artifact_base_url(group_id, artifact_id)}maven-metadata.xml"
+
+    @classmethod
+    def artifact_url(
+        cls, group_id: str, artifact_id: str, version: str, classifier: str | None = None, extension: str = "jar"
+    ) -> str:
+        """Get the URL for a versioned Maven artifact."""
+        # TODO: Unit test this method, including classifier and extension.
+        _classifier = f"-{classifier}" if classifier else ""
+        artifact_base_url = cls._artifact_base_url(group_id, artifact_id)
+        return f"{artifact_base_url}{version}/{artifact_id}-{version}{_classifier}.{extension}"
+
+    @classmethod
+    def get_current_maven_artifact_version(cls, group_id: str, artifact_id: str) -> str | None:
+        url = cls.artifact_metadata_url(group_id, artifact_id)
+        try:
+            with request.urlopen(url) as server:
+                text = server.read()
+        except HTTPError as e:
+            logger.error(f"Error while fetching maven metadata: {group_id}:{artifact_id}", exc_info=e)
+            return None
+        logger.debug(f"Maven metadata for {group_id}:{artifact_id}: {text}")
+        return cls._extract_latest_release_version(text)
+
+    @classmethod
+    def _extract_latest_release_version(cls, maven_metadata: str) -> str | None:
+        """Extract the latest release version from Maven metadata."""
+        # Reference: https://maven.apache.org/repositories/metadata.html#The_A_Level_Metadata
+        # TODO: Unit test this method, to verify the sequence of things it checks for.
+        root = ET.fromstring(maven_metadata)
+        for label in ("release", "latest"):
+            version = root.findtext(f"./versioning/{label}")
+            if version is not None:
+                return version
+        return root.findtext("./versioning/versions/version[last()]")
+
+    @classmethod
+    def download_artifact_from_maven(
+        cls,
+        group_id: str,
+        artifact_id: str,
+        version: str,
+        target: Path,
+        classifier: str | None = None,
+        extension: str = "jar",
+    ) -> bool:
+        if target.exists():
+            logger.warning(f"Skipping download of {group_id}:{artifact_id}:{version}; target already exists: {target}")
+            return True
+        url = cls.artifact_url(group_id, artifact_id, version, classifier, extension)
+        try:
+            path, _ = request.urlretrieve(url)
+            logger.debug(f"Downloaded maven artefact from {url} to {path}")
+        except URLError as e:
+            logger.error(f"Unable to download maven artefact: {group_id}:{artifact_id}:{version}", exc_info=e)
+            return False
+        logger.debug(f"Moving {path} to {target}")
+        shutil.move(path, target)
+        logger.info(f"Successfully installed: {group_id}:{artifact_id}:{version}")
+        return True
+
+    def __init__(
+        self,
+        repository: TranspilerRepository,
+        product_name: str,
+        group_id: str,
+        artifact_id: str,
+        artifact: Path | None = None,
+    ) -> None:
+        super().__init__(repository, product_name)
+        self._group_id = group_id
+        self._artifact_id = artifact_id
+        self._artifact = artifact
+
+    def install(self) -> Path | None:
+        return self._install_checking_versions()
+
+    def _install_checking_versions(self) -> Path | None:
+        if self._artifact:
+            latest_version = self.get_local_artifact_version(self._artifact)
+        else:
+            latest_version = self.get_current_maven_artifact_version(self._group_id, self._artifact_id)
+        if latest_version is None:
+            logger.warning(f"Could not determine the latest version of Databricks {self._product_name} transpiler")
+            logger.error(f"Failed to install transpiler: Databricks {self._product_name} transpiler")
+            return None
+        installed_version = self._repository.get_installed_version(self._product_name)
+        if installed_version == latest_version:
+            logger.info(f"Databricks {self._product_name} transpiler v{latest_version} already installed")
+            return None
+        return self._install_version_with_backup(latest_version)
+
+    def _install_version(self, version: str) -> bool:
+        jar_file_path = self._install_path / f"{self._artifact_id}.jar"
+        if self._artifact:
+            logger.debug(f"Copying: {self._artifact} -> {jar_file_path}")
+            shutil.copyfile(self._artifact, jar_file_path)
+        elif not self.download_artifact_from_maven(self._group_id, self._artifact_id, version, jar_file_path):
+            logger.error(f"Failed to install Databricks {self._product_name} transpiler (v{version})")
+            return False
+        self._copy_lsp_config(jar_file_path)
+        return True
+
+    def _copy_lsp_config(self, jar_file_path: Path) -> None:
+        with ZipFile(jar_file_path) as zip_file:
+            zip_file.extract("lsp/config.yml", self._install_path)
+        shutil.move(self._install_path / "lsp" / "config.yml", self._install_path / "config.yml")
+        os.rmdir(self._install_path / "lsp")
+
+
+class TranspilerInstaller(abc.ABC):
+    def __init__(self, transpiler_repository: TranspilerRepository) -> None:
+        self._transpiler_repository = transpiler_repository
+
+    @property
+    @abc.abstractmethod
+    def name(self) -> str:
+        """The name of this transpiler, as noted in its internal configuration."""
+
+    @abc.abstractmethod
+    def can_install(self, artifact: Path) -> bool:
+        """Check whether the given path is an artifact that can be installed by this installer."""
+
+    @abc.abstractmethod
+    def install(self, artifact: Path | None = None) -> bool:
+        """Install or upgrade a transpiler.
+
+        This method is responsible for installing a transpiler, including obtaining any necessary online artifacts.
+
+        Args:
+            artifact: An optional local path for the transpiler artifact, if it should be used instead of the online
+                artifact.
+        Returns:
+            True if the transpiler was installed or updated, or False if the transpiler was already up-to-date.
+        """
+
+
+class BladebridgeInstaller(TranspilerInstaller):
+    @property
+    def name(self) -> str:
+        return "Bladebridge"
+
+    def can_install(self, artifact: Path) -> bool:
+        return "databricks_bb_plugin" in artifact.name and artifact.suffix == ".whl"
+
+    def install(self, artifact: Path | None = None) -> bool:
+        local_name = "bladebridge"
+        pypi_name = "databricks-bb-plugin"
+        wheel_installer = WheelInstaller(self._transpiler_repository, local_name, pypi_name, artifact)
+        return wheel_installer.install() is not None
+
+
+class MorpheusInstaller(TranspilerInstaller):
+    @property
+    def name(self) -> str:
+        return "Morpheus"
+
+    def can_install(self, artifact: Path) -> bool:
+        return "databricks-morph-plugin" in artifact.name and artifact.suffix == ".jar"
+
+    def install(self, artifact: Path | None = None) -> bool:
+        if not self.is_java_version_okay():
+            logger.error(
+                "The morpheus transpiler requires Java 11 or above. Please install Java and re-run 'install-transpile'."
+            )
+            return False
+        product_name = "databricks-morph-plugin"
+        group_id = "com.databricks.labs"
+        artifact_id = product_name
+        maven_installer = MavenInstaller(self._transpiler_repository, product_name, group_id, artifact_id, artifact)
+        return maven_installer.install() is not None
+
+    @classmethod
+    def is_java_version_okay(cls) -> bool:
+        detected_java = cls.find_java()
+        match detected_java:
+            case None:
+                logger.warning("No Java executable found in the system PATH.")
+                return False
+            case (java_executable, None):
+                logger.warning(f"Java found, but could not determine the version: {java_executable}.")
+                return False
+            case (java_executable, bytes(raw_version)):
+                logger.warning(f"Java found ({java_executable}), but could not parse the version:\n{raw_version}")
+                return False
+            case (java_executable, tuple(old_version)) if old_version < (11, 0, 0, 0):
+                version_str = ".".join(str(v) for v in old_version)
+                logger.warning(f"Java found ({java_executable}), but version {version_str} is too old.")
+                return False
+            case _:
+                return True
+
+    @classmethod
+    def find_java(cls) -> tuple[Path, tuple[int, int, int, int] | bytes | None] | None:
+        """Locate Java and return its version, as reported by `java -version`.
+
+        The java executable is currently located by searching the system PATH. Its version is parsed from the output of
+        the `java -version` command, which has been standardized since Java 10.
+
+        Returns:
+            A tuple of its path and the version as a tuple of integers (feature, interim, update, patch), if the java
+            executable could be located. If the version cannot be parsed, instead the raw version information is
+            returned, or `None` as a last resort. When no java executable is found, `None` is returned instead of a
+            tuple.
+        """
+        # Platform-independent way to reliably locate the java executable.
+        # Reference: https://docs.python.org/3.10/library/subprocess.html#popen-constructor
+        java_executable = shutil.which("java")
+        if java_executable is None:
+            return None
+        java_executable_path = Path(java_executable)
+        logger.debug(f"Using java executable: {java_executable_path!r}")
+        try:
+            completed = subprocess.run(
+                [str(java_executable_path), "-version"], shell=False, capture_output=True, check=True
+            )
+        except subprocess.CalledProcessError as e:
+            logger.debug(
+                f"Failed to run {e.args!r} (exit-code={e.returncode}, stdout={e.stdout!r}, stderr={e.stderr!r})",
+                exc_info=e,
+            )
+            return java_executable_path, None
+        # It might not be ascii, but the bits we care about are so this will never fail.
+        raw_output = completed.stderr
+        java_version_output = raw_output.decode("ascii", errors="ignore")
+        java_version = cls._parse_java_version(java_version_output)
+        if java_version is None:
+            return java_executable_path, raw_output.strip()
+        logger.debug(f"Detected java version: {java_version}")
+        return java_executable_path, java_version
+
+    # Pattern to match a Java version string, compiled at import time to ensure it's valid.
+    # Ref: https://docs.oracle.com/en/java/javase/11/install/version-string-format.html
+    _java_version_pattern = re.compile(
+        r' version "(?P<feature>\d+)(?:\.(?P<interim>\d+)(?:\.(?P<update>\d+)(?:\.(?P<patch>\d+))?)?)?"'
+    )
+
+    @classmethod
+    def _parse_java_version(cls, version: str) -> tuple[int, int, int, int] | None:
+        """Locate and parse the Java version in the output of `java -version`."""
+        # Output looks like this:
+        #   openjdk version "24.0.1" 2025-04-15
+        #   OpenJDK Runtime Environment Temurin-24.0.1+9 (build 24.0.1+9)
+        #   OpenJDK 64-Bit Server VM Temurin-24.0.1+9 (build 24.0.1+9, mixed mode)
+        match = cls._java_version_pattern.search(version)
+        if not match:
+            logger.debug(f"Could not parse java version: {version!r}")
+            return None
+        feature = int(match["feature"])
+        interim = int(match["interim"] or 0)
+        update = int(match["update"] or 0)
+        patch = int(match["patch"] or 0)
+        return feature, interim, update, patch
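
For orientation, a minimal sketch (not part of the package) of how the `_PathBackup` context manager above is meant to wrap an install attempt; the target directory here is hypothetical, whereas the real installer derives it from `TranspilerRepository.transpilers_path()`:

    from pathlib import Path

    from databricks.labs.lakebridge.transpiler.installers import _PathBackup

    product_path = Path("/tmp/demo-transpiler")  # hypothetical install target
    product_path.mkdir(parents=True, exist_ok=True)

    with _PathBackup(product_path) as backup:
        # Entering the block renames any existing directory to "demo-transpiler-saved";
        # the installer then recreates the target and works inside it.
        (product_path / "lib").mkdir(parents=True)
        install_ok = True  # stand-in for the result of _install_version()
        if install_ok:
            backup.commit()  # discard the "-saved" copy and keep the new install
        else:
            backup.rollback()  # delete the new install and restore the "-saved" copy
    # If an exception escapes the block before commit()/rollback() runs, __exit__
    # rolls back automatically; on a clean exit without either call, it commits.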
databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py
@@ -4,7 +4,9 @@ import abc
 import asyncio
 import logging
 import os
+import shutil
 import sys
+import venv
 from collections.abc import Callable, Sequence, Mapping
 from dataclasses import dataclass
 from pathlib import Path
@@ -35,7 +37,7 @@ from pygls.lsp.client import BaseLanguageClient
 from databricks.labs.blueprint.wheels import ProductInfo
 from databricks.labs.lakebridge.config import LSPConfigOptionV1, TranspileConfig, TranspileResult
 from databricks.labs.lakebridge.errors.exceptions import IllegalStateException
-from databricks.labs.lakebridge.helpers.file_utils import chdir, is_dbt_project_file, is_sql_file
+from databricks.labs.lakebridge.helpers.file_utils import is_dbt_project_file, is_sql_file
 from databricks.labs.lakebridge.transpiler.transpile_engine import TranspileEngine
 from databricks.labs.lakebridge.transpiler.transpile_status import (
     CodePosition,
@@ -409,9 +411,7 @@ class LSPEngine(TranspileEngine):
         if self.is_alive:
             raise IllegalStateException("LSP engine is already initialized")
         try:
-            # TODO: Avoid this by setting the working directory when launching the child process.
-            with chdir(self._workdir):
-                await self._do_initialize(config)
+            await self._do_initialize(config)
             await self._await_for_transpile_capability()
         # it is good practice to catch broad exceptions raised by launching a child process
         except Exception as e:  # pylint: disable=broad-exception-caught
@@ -432,65 +432,52 @@
         logger.debug(f"LSP init params: {params}")
         self._init_response = await self._client.initialize_async(params)
 
-    async def _start_server(self):
-        executable = self._config.remorph.command_line[0]
-        if executable in {"python", "python3"}:
-            await self._start_python_server()
-        else:
-            await self._start_other_server()
-
-    async def _start_python_server(self):
-        has_venv = (self._workdir / ".venv").exists()
-        if has_venv:
-            await self._start_python_server_with_venv()
-        else:
-            await self._start_python_server_without_venv()
-
-    async def _start_python_server_with_venv(self):
-        env: dict[str, str] = os.environ | self._config.remorph.env_vars
-        # ensure modules are searched within venv
-        if "PYTHONPATH" in env.keys():
-            del env["PYTHONPATH"]
-        if "VIRTUAL_ENV" in env.keys():
-            del env["VIRTUAL_ENV"]
-        if "VIRTUAL_ENV_PROMPT" in env.keys():
-            del env["VIRTUAL_ENV_PROMPT"]
-        path = self._workdir / ".venv" / "Scripts" if sys.platform == "win32" else self._workdir / ".venv" / "bin"
-        if "PATH" in env.keys():
-            env["PATH"] = str(path) + os.pathsep + env["PATH"]
-        else:
-            env["PATH"] = str(path)
-        python = "python.exe" if sys.platform == "win32" else "python3"
-        executable = path / python
-        await self._launch_executable(executable, env)
-
-    async def _start_python_server_without_venv(self):
-        env: dict[str, str] = os.environ | self._config.remorph.env_vars
-        # ensure modules are searched locally before being searched in remorph
-        if "PYTHONPATH" in env.keys():
-            env["PYTHONPATH"] = str(self._workdir) + os.pathsep + env["PYTHONPATH"]
-        else:
-            env["PYTHONPATH"] = str(self._workdir)
-        executable = Path(self._config.remorph.command_line[0])
-        await self._launch_executable(executable, env)
+    async def _start_server(self) -> None:
+        # Sanity-check and split the command-line into components.
+        if not (command_line := self._config.remorph.command_line):
+            raise ValueError(f"Missing command line for LSP server: {self._config.path}")
+        executable, *args = command_line
 
-    async def _start_other_server(self):
+        # Extract the environment, preparing to ensure that PATH is set correctly.
         env: dict[str, str] = os.environ | self._config.remorph.env_vars
-        # ensure modules are searched within venv
-        if "PYTHONPATH" in env.keys():
-            del env["PYTHONPATH"]
-        if "VIRTUAL_ENV" in env.keys():
-            del env["VIRTUAL_ENV"]
-        if "VIRTUAL_ENV_PROMPT" in env.keys():
-            del env["VIRTUAL_ENV_PROMPT"]
-        executable = Path(self._config.remorph.command_line[0])
-        await self._launch_executable(executable, env)
-
-    async def _launch_executable(self, executable: Path, env: Mapping):
+        path = env.get("PATH", os.defpath)
+
+        # If we have a virtual environment, ensure the bin directory is first on the PATH. This normally takes
+        # care of python executables, but also deals with any entry-points that the LSP server might install.
+        if (venv_path := self._workdir / ".venv").exists():
+            executable, additional_path = self._activate_venv(venv_path, executable)
+            # Ensure PATH is in sync with the search path we will use to locate the LSP server executable.
+            env["PATH"] = path = f"{additional_path}{os.pathsep}{path}"
+            logger.debug(f"Using PATH for launching LSP server: {path}")
+
+        # Locate the LSP server executable in a platform-independent way.
+        # Reference: https://docs.python.org/3/library/subprocess.html#popen-constructor
+        executable = shutil.which(executable, path=path) or executable
+
+        await self._launch_executable(executable, args, env)
+
+    @staticmethod
+    def _activate_venv(venv_path: Path, executable: str) -> tuple[str, Path]:
+        """Obtain the bin/script directory for the virtual environment, to extend the search path."""
+        logger.debug(f"Detected virtual environment to use at: {venv_path}")
+        use_symlinks = sys.platform != "win32"
+        builder = venv.EnvBuilder(symlinks=use_symlinks)
+        context = builder.ensure_directories(venv_path)
+
+        # Workaround for Windows, where bin_path (Scripts/) doesn't contain python3.exe: if the executable is python
+        # or python3, we substitute it for what is needed to launch the venv's python interpreter.
+        if os.path.normcase(executable) in {"python", "python3"}:
+            executable = context.env_exec_cmd
+
+        return executable, context.bin_path
+
+    async def _launch_executable(self, executable: str, args: Sequence[str], env: Mapping[str, str]) -> None:
         log_level = logging.getLevelName(logging.getLogger("databricks").level)
-        args = self._config.remorph.command_line[1:] + [f"--log_level={log_level}"]
-        logger.debug(f"Starting LSP engine: {executable} {args} (cwd={os.getcwd()})")
-        await self._client.start_io(str(executable), env=env, *args)
+        # TODO: Remove the --log_level argument once all our transpilers support the environment variable.
+        args = [*args, f"--log_level={log_level}"]
+        env = {**env, "DATABRICKS_LAKEBRIDGE_LOG_LEVEL": log_level}
+        logger.debug(f"Starting LSP engine: {executable} {args} (cwd={self._workdir})")
+        await self._client.start_io(executable, *args, env=env, cwd=self._workdir)
 
     def _client_capabilities(self):
         return ClientCapabilities()  # TODO do we need to refine this ?
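
The rewritten `_start_server` above replaces the three Python-specific launch paths with a single resolution strategy: prepend the venv's bin directory to PATH (when a `.venv` exists in the working directory) and resolve the configured executable with `shutil.which`. A standalone sketch, with `workdir` and `command_line` as hypothetical stand-ins for values the engine reads from its LSP configuration:

    import os
    import shutil
    import sys
    import venv
    from pathlib import Path

    workdir = Path("/tmp/lsp-demo")          # hypothetical transpiler workdir
    command_line = ["python3", "server.py"]  # hypothetical config command line

    executable, *args = command_line
    env = dict(os.environ)
    path = env.get("PATH", os.defpath)

    if (venv_path := workdir / ".venv").exists():
        # ensure_directories() computes the venv layout (bin_path, env_exec_cmd)
        # for an existing environment without re-provisioning the interpreter.
        context = venv.EnvBuilder(symlinks=sys.platform != "win32").ensure_directories(venv_path)
        if os.path.normcase(executable) in {"python", "python3"}:
            executable = context.env_exec_cmd  # e.g. .venv/bin/python
        env["PATH"] = path = f"{context.bin_path}{os.pathsep}{path}"

    # Resolve against the (possibly venv-first) search path, as the engine does.
    resolved = shutil.which(executable, path=path) or executable
    print(resolved, args)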
databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py
@@ -18,6 +18,8 @@ SQLGLOT_DIALECTS: dict[str, type[Dialect] | str] = {
     "teradata": Dialects.TERADATA,
     "trino": Dialects.TRINO,
     "tsql": Dialects.TSQL,
+    "mssql": Dialects.TSQL,
+    "synapse": Dialects.TSQL,
     "vertica": Dialects.POSTGRES,
 }
 
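
The two added entries are aliases rather than new dialects. A quick sketch of what they mean in practice for dialect lookup:

    from databricks.labs.lakebridge.transpiler.sqlglot.dialect_utils import SQLGLOT_DIALECTS

    # "mssql" and "synapse" resolve to the same sqlglot dialect as "tsql", so
    # sources labelled with any of the three names are parsed as T-SQL.
    assert SQLGLOT_DIALECTS["mssql"] is SQLGLOT_DIALECTS["tsql"]
    assert SQLGLOT_DIALECTS["synapse"] is SQLGLOT_DIALECTS["tsql"]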