databricks-labs-lakebridge 0.10.6__py3-none-any.whl → 0.10.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/analyzer/__init__.py +0 -0
  3. databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +95 -0
  4. databricks/labs/lakebridge/base_install.py +24 -3
  5. databricks/labs/lakebridge/cli.py +19 -53
  6. databricks/labs/lakebridge/contexts/application.py +7 -0
  7. databricks/labs/lakebridge/deployment/job.py +2 -2
  8. databricks/labs/lakebridge/helpers/file_utils.py +36 -0
  9. databricks/labs/lakebridge/install.py +187 -157
  10. databricks/labs/lakebridge/reconcile/compare.py +70 -33
  11. databricks/labs/lakebridge/reconcile/connectors/data_source.py +19 -0
  12. databricks/labs/lakebridge/reconcile/connectors/databricks.py +11 -1
  13. databricks/labs/lakebridge/reconcile/connectors/dialect_utils.py +126 -0
  14. databricks/labs/lakebridge/reconcile/connectors/models.py +7 -0
  15. databricks/labs/lakebridge/reconcile/connectors/oracle.py +11 -1
  16. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +14 -2
  17. databricks/labs/lakebridge/reconcile/connectors/tsql.py +27 -2
  18. databricks/labs/lakebridge/reconcile/constants.py +4 -3
  19. databricks/labs/lakebridge/reconcile/execute.py +9 -810
  20. databricks/labs/lakebridge/reconcile/normalize_recon_config_service.py +133 -0
  21. databricks/labs/lakebridge/reconcile/query_builder/base.py +3 -7
  22. databricks/labs/lakebridge/reconcile/recon_config.py +3 -0
  23. databricks/labs/lakebridge/reconcile/recon_output_config.py +2 -1
  24. databricks/labs/lakebridge/reconcile/reconciliation.py +508 -0
  25. databricks/labs/lakebridge/reconcile/schema_compare.py +26 -19
  26. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +98 -0
  27. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +253 -0
  28. databricks/labs/lakebridge/reconcile/utils.py +38 -0
  29. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +45 -60
  30. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +2 -0
  31. databricks/labs/lakebridge/transpiler/transpile_engine.py +0 -18
  32. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/METADATA +1 -1
  33. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/RECORD +37 -28
  34. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/WHEEL +0 -0
  35. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/entry_points.txt +0 -0
  36. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/LICENSE +0 -0
  37. {databricks_labs_lakebridge-0.10.6.dist-info → databricks_labs_lakebridge-0.10.7.dist-info}/licenses/NOTICE +0 -0
@@ -1,40 +1,41 @@
  import re
  import abc
  import dataclasses
- import shutil
- from json import loads, dump
  import logging
  import os
- from shutil import rmtree, move
- from subprocess import run, CalledProcessError
+ import shutil
  import sys
- from typing import Any, cast
- from urllib import request
- from urllib.error import URLError, HTTPError
+ import venv
  import webbrowser
+ import xml.etree.ElementTree as ET
  from datetime import datetime, timezone
+ from json import loads, dump
  from pathlib import Path
- import xml.etree.ElementTree as ET
+ from shutil import rmtree, move
+ from subprocess import run, CalledProcessError
+ from typing import Any, Literal, cast
+ from urllib import request
+ from urllib.error import URLError, HTTPError
  from zipfile import ZipFile

- from databricks.labs.blueprint.installation import Installation, JsonValue
- from databricks.labs.blueprint.installation import SerdeError
+ from databricks.labs.blueprint.installation import Installation, JsonValue, SerdeError
  from databricks.labs.blueprint.installer import InstallState
  from databricks.labs.blueprint.tui import Prompts
  from databricks.labs.blueprint.wheels import ProductInfo
  from databricks.sdk import WorkspaceClient
  from databricks.sdk.errors import NotFound, PermissionDenied

+ from databricks.labs.lakebridge.__about__ import __version__
  from databricks.labs.lakebridge.config import (
-     TranspileConfig,
-     ReconcileConfig,
      DatabaseConfig,
+     ReconcileConfig,
      LakebridgeConfiguration,
      ReconcileMetadataConfig,
+     TranspileConfig,
  )
+ from databricks.labs.lakebridge.contexts.application import ApplicationContext
  from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator
  from databricks.labs.lakebridge.deployment.installation import WorkspaceInstallation
- from databricks.labs.lakebridge.helpers.file_utils import chdir
  from databricks.labs.lakebridge.reconcile.constants import ReconReportType, ReconSourceType
  from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository

@@ -43,9 +44,70 @@ logger = logging.getLogger(__name__)
  TRANSPILER_WAREHOUSE_PREFIX = "Lakebridge Transpiler Validation"


+ class _PathBackup:
+     """A context manager to preserve a path before performing an operation, and optionally restore it afterwards."""
+
+     def __init__(self, path: Path) -> None:
+         self._path = path
+         self._backup_path: Path | None = None
+         self._finished = False
+
+     def __enter__(self) -> "_PathBackup":
+         self.start()
+         return self
+
+     def start(self) -> None:
+         """Start the backup process by creating a backup of the path, if it already exists."""
+         backup_path = self._path.with_name(f"{self._path.name}-saved")
+         if backup_path.exists():
+             logger.debug(f"Existing backup found, removing: {backup_path}")
+             rmtree(backup_path)
+         if self._path.exists():
+             logger.debug(f"Backing up existing path: {self._path} -> {backup_path}")
+             os.rename(self._path, backup_path)
+             self._backup_path = backup_path
+         else:
+             self._backup_path = None
+
+     def rollback(self) -> None:
+         """Rollback the operation by restoring the backup path, if it exists."""
+         assert not self._finished, "Can only rollback/commit once."
+         logger.debug(f"Removing path: {self._path}")
+         rmtree(self._path)
+         if self._backup_path is not None:
+             logger.debug(f"Restoring previous path: {self._backup_path} -> {self._path}")
+             os.rename(self._backup_path, self._path)
+             self._backup_path = None
+         self._finished = True
+
+     def commit(self) -> None:
+         """Commit the operation by removing the backup path, if it exists."""
+         assert not self._finished, "Can only rollback/commit once."
+         if self._backup_path is not None:
+             logger.debug(f"Removing backup path: {self._backup_path}")
+             rmtree(self._backup_path)
+             self._backup_path = None
+         self._finished = True
+
+     def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]:
+         if not self._finished:
+             # Automatically commit or rollback based on whether an exception is underway.
+             if exc_val is None:
+                 self.commit()
+             else:
+                 self.rollback()
+         return False  # Do not suppress any exception underway
+
+
  class TranspilerInstaller(abc.ABC):
-     def __init__(self, repository: TranspilerRepository) -> None:
+
+     # TODO: Remove these properties when post-install is removed.
+     _install_path: Path
+     """The path where the transpiler is being installed, once this starts."""
+
+     def __init__(self, repository: TranspilerRepository, product_name: str) -> None:
          self._repository = repository
+         self._product_name = product_name

      _version_pattern = re.compile(r"[_-](\d+(?:[.\-_]\w*\d+)+)")

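Aside (editor's illustration, not part of the package diff): the new _PathBackup above commits on a clean exit and rolls back when an exception escapes the with block. A minimal standalone sketch of that behaviour, assuming _PathBackup is imported from the module shown in this diff and using a throwaway directory:

    import tempfile
    from pathlib import Path

    destination = Path(tempfile.mkdtemp()) / "product"  # hypothetical install target
    destination.mkdir()
    (destination / "old.txt").write_text("previous install")

    try:
        with _PathBackup(destination):  # moves the existing directory aside to "product-saved"
            destination.mkdir()  # recreate the target and pretend to install into it
            (destination / "new.txt").write_text("fresh install")
            raise RuntimeError("simulated install failure")
    except RuntimeError:
        pass

    assert (destination / "old.txt").exists()  # rollback restored the original contents
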
@@ -75,9 +137,44 @@ class TranspilerInstaller(abc.ABC):
              dump(version_data, f)
              f.write("\n")

+     def _install_version_with_backup(self, version: str) -> Path | None:
+         """Install a specific version of the transpiler, with backup handling."""
+         logger.info(f"Installing Databricks {self._product_name} transpiler (v{version})")
+         product_path = self._repository.transpilers_path() / self._product_name
+         with _PathBackup(product_path) as backup:
+             self._install_path = product_path / "lib"
+             self._install_path.mkdir(parents=True, exist_ok=True)
+             try:
+                 result = self._install_version(version)
+             except (CalledProcessError, KeyError, ValueError) as e:
+                 # Warning: if you end up here under the IntelliJ/PyCharm debugger, it can be because the debugger is
+                 # trying to inject itself into the subprocess. Try disabling:
+                 # Settings | Build, Execution, Deployment | Python Debugger | Attach to subprocess automatically while debugging
+                 # Note: Subprocess output is not captured, and should already be visible in the console.
+                 logger.error(f"Failed to install {self._product_name} transpiler (v{version})", exc_info=e)
+                 result = False
+
+             if result:
+                 logger.info(f"Successfully installed {self._product_name} transpiler (v{version})")
+                 self._store_product_state(product_path=product_path, version=version)
+                 backup.commit()
+                 return product_path
+             backup.rollback()
+             return None
+
+     @abc.abstractmethod
+     def _install_version(self, version: str) -> bool:
+         """Install a specific version of the transpiler, returning True if successful."""
+

  class WheelInstaller(TranspilerInstaller):

+     _venv_exec_cmd: Path
+     """Once created, the command to run the virtual environment's Python executable."""
+
+     _site_packages: Path
+     """Once created, the path to the site-packages directory in the virtual environment."""
+
      @classmethod
      def get_latest_artifact_version_from_pypi(cls, product_name: str) -> str | None:
          try:
@@ -96,8 +193,7 @@ class WheelInstaller(TranspilerInstaller):
          pypi_name: str,
          artifact: Path | None = None,
      ) -> None:
-         super().__init__(repository)
-         self._product_name = product_name
+         super().__init__(repository, product_name)
          self._pypi_name = pypi_name
          self._artifact = artifact

@@ -118,116 +214,45 @@ class WheelInstaller(TranspilerInstaller):
          if installed_version == latest_version:
              logger.info(f"{self._pypi_name} v{latest_version} already installed")
              return None
-         return self._install_latest_version(latest_version)
-
-     def _install_latest_version(self, version: str) -> Path | None:
-         logger.info(f"Installing Databricks {self._product_name} transpiler v{version}")
-         self._product_path = self._repository.transpilers_path() / self._product_name
-         backup_path = Path(f"{self._product_path!s}-saved")
-         if self._product_path.exists():
-             os.rename(self._product_path, backup_path)
-         self._install_path = self._product_path / "lib"
-         self._install_path.mkdir(parents=True, exist_ok=True)
-         try:
-             result = self._unsafe_install_latest_version(version)
-             logger.info(f"Successfully installed {self._pypi_name} v{version}")
-             if backup_path.exists():
-                 rmtree(backup_path)
-             return result
-         except (CalledProcessError, ValueError) as e:
-             logger.error(f"Failed to install {self._pypi_name} v{version}", exc_info=e)
-             rmtree(self._product_path)
-             if backup_path.exists():
-                 os.rename(backup_path, self._product_path)
-             return None
+         return self._install_version_with_backup(latest_version)

-     def _unsafe_install_latest_version(self, version: str) -> Path | None:
+     def _install_version(self, version: str) -> bool:
          self._create_venv()
          self._install_with_pip()
          self._copy_lsp_resources()
-         return self._post_install(version)
+         return self._post_install() is not None

      def _create_venv(self) -> None:
-         with chdir(self._install_path):
-             self._unsafe_create_venv()
-
-     def _unsafe_create_venv(self) -> None:
-         # using the venv module doesn't work (maybe it's not possible to create a venv from a venv ?)
-         # so falling back to something that works
-         # for some reason this requires shell=True, so pass full cmd line
-         cmd_line = f"{sys.executable} -m venv .venv"
-         completed = run(cmd_line, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, shell=True, check=False)
-         if completed.returncode:
-             logger.error(f"Failed to create venv, error code: {completed.returncode}")
-             if completed.stdout:
-                 for line in completed.stdout:
-                     logger.error(line)
-             if completed.stderr:
-                 for line in completed.stderr:
-                     logger.error(line)
-         completed.check_returncode()
-         self._venv = self._install_path / ".venv"
-         self._site_packages = self._locate_site_packages()
-
-     def _locate_site_packages(self) -> Path:
-         # can't use sysconfig because it only works for currently running python
-         if sys.platform == "win32":
-             return self._locate_site_packages_windows()
-         return self._locate_site_packages_linux_or_macos()
-
-     def _locate_site_packages_windows(self) -> Path:
-         packages = self._venv / "Lib" / "site-packages"
-         if packages.exists():
-             return packages
-         raise ValueError(f"Could not locate 'site-packages' for {self._venv!s}")
-
-     def _locate_site_packages_linux_or_macos(self) -> Path:
-         lib = self._venv / "lib"
-         for dir_ in os.listdir(lib):
-             if dir_.startswith("python"):
-                 packages = lib / dir_ / "site-packages"
-                 if packages.exists():
-                     return packages
-         raise ValueError(f"Could not locate 'site-packages' for {self._venv!s}")
-
-     def _install_with_pip(self) -> None:
-         with chdir(self._install_path):
-             # the way to call pip from python is highly sensitive to os and source type
-             if self._artifact:
-                 self._install_local_artifact()
-             else:
-                 self._install_remote_artifact()
-
-     def _install_local_artifact(self) -> None:
-         pip = self._locate_pip()
-         pip = pip.relative_to(self._install_path)
-         target = self._site_packages
-         target = target.relative_to(self._install_path)
-         if sys.platform == "win32":
-             command = f"{pip!s} install {self._artifact!s} -t {target!s}"
-             completed = run(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, shell=False, check=False)
+         venv_path = self._install_path / ".venv"
+         # Sadly, some platform-specific variations need to be dealt with:
+         # - Windows venvs do not use symlinks, but rather copies, when populating the venv.
+         # - The library path is different.
+         if use_symlinks := sys.platform != "win32":
+             major, minor = sys.version_info[:2]
+             lib_path = venv_path / "lib" / f"python{major}.{minor}" / "site-packages"
          else:
-             command = f"'{pip!s}' install '{self._artifact!s}' -t '{target!s}'"
-             completed = run(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, shell=True, check=False)
-         # checking return code later makes debugging easier
-         completed.check_returncode()
-
-     def _install_remote_artifact(self) -> None:
-         pip = self._locate_pip()
-         pip = pip.relative_to(self._install_path)
-         target = self._site_packages
-         target = target.relative_to(self._install_path)
-         if sys.platform == "win32":
-             args = [str(pip), "install", self._pypi_name, "-t", str(target)]
-             completed = run(args, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, shell=False, check=False)
-         else:
-             command = f"'{pip!s}' install {self._pypi_name} -t '{target!s}'"
-             completed = run(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, shell=True, check=False)
-         # checking return code later makes debugging easier
-         completed.check_returncode()
+             lib_path = venv_path / "Lib" / "site-packages"
+         builder = venv.EnvBuilder(with_pip=True, prompt=f"{self._product_name}", symlinks=use_symlinks)
+         builder.create(venv_path)
+         context = builder.ensure_directories(venv_path)
+         logger.debug(f"Created virtual environment with context: {context}")
+         self._venv_exec_cmd = context.env_exec_cmd
+         self._site_packages = lib_path

-     def _locate_pip(self) -> Path:
-         return self._venv / "Scripts" / "pip3.exe" if sys.platform == "win32" else self._venv / "bin" / "pip3"
+     def _install_with_pip(self) -> None:
+         # Based on: https://pip.pypa.io/en/stable/user_guide/#using-pip-from-your-program
+         # (But with venv_exec_cmd instead of sys.executable, so that we use the venv's pip.)
+         to_install: Path | str = self._artifact if self._artifact is not None else self._pypi_name
+         command: list[Path | str] = [
+             self._venv_exec_cmd,
+             "-m",
+             "pip",
+             "--disable-pip-version-check",
+             "install",
+             to_install,
+         ]
+         result = run(command, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, check=False)
+         result.check_returncode()

      def _copy_lsp_resources(self):
          lsp = self._site_packages / "lsp"
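Aside (editor's sketch, not part of the package diff): the rewritten _create_venv and _install_with_pip above replace the old shell-based "python -m venv" and pip invocations with the standard-library venv.EnvBuilder plus a "python -m pip install" run through the new environment's own interpreter. A minimal standalone version of that pattern, with a hypothetical target directory and package name:

    import sys
    import venv
    from pathlib import Path
    from subprocess import run

    venv_path = Path("/tmp/transpiler-demo/.venv")  # hypothetical location
    builder = venv.EnvBuilder(with_pip=True, symlinks=sys.platform != "win32")
    builder.create(venv_path)
    context = builder.ensure_directories(venv_path)  # exposes env_exec_cmd, the venv's own python

    # Install into the venv with its interpreter, mirroring the diff's approach.
    run(
        [context.env_exec_cmd, "-m", "pip", "--disable-pip-version-check", "install", "databricks-bb-plugin"],
        check=True,
    )
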
@@ -235,21 +260,20 @@ class WheelInstaller(TranspilerInstaller):
              raise ValueError("Installed transpiler is missing a 'lsp' folder")
          shutil.copytree(lsp, self._install_path, dirs_exist_ok=True)

-     def _post_install(self, version: str) -> Path | None:
+     def _post_install(self) -> Path | None:
          config = self._install_path / "config.yml"
          if not config.exists():
              raise ValueError("Installed transpiler is missing a 'config.yml' file in its 'lsp' folder")
          install_ext = "ps1" if sys.platform == "win32" else "sh"
          install_script = f"installer.{install_ext}"
-         installer = self._install_path / install_script
-         if installer.exists():
-             self._run_custom_installer(installer)
-         self._store_product_state(product_path=self._product_path, version=version)
+         installer_path = self._install_path / install_script
+         if installer_path.exists():
+             self._run_custom_installer(installer_path)
          return self._install_path

-     def _run_custom_installer(self, installer):
-         args = [str(installer)]
-         run(args, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, cwd=str(self._install_path), check=True)
+     def _run_custom_installer(self, installer_path: Path) -> None:
+         args = [installer_path]
+         run(args, stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr, cwd=self._install_path, check=True)


  class MavenInstaller(TranspilerInstaller):
@@ -336,8 +360,7 @@ class MavenInstaller(TranspilerInstaller):
          artifact_id: str,
          artifact: Path | None = None,
      ) -> None:
-         super().__init__(repository)
-         self._product_name = product_name
+         super().__init__(repository, product_name)
          self._group_id = group_id
          self._artifact_id = artifact_id
          self._artifact = artifact
@@ -358,40 +381,15 @@ class MavenInstaller(TranspilerInstaller):
          if installed_version == latest_version:
              logger.info(f"Databricks {self._product_name} transpiler v{latest_version} already installed")
              return None
-         return self._install_version(latest_version)
+         return self._install_version_with_backup(latest_version)

-     def _install_version(self, version: str) -> Path | None:
-         logger.info(f"Installing Databricks {self._product_name} transpiler v{version}")
-         self._product_path = self._repository.transpilers_path() / self._product_name
-         backup_path = Path(f"{self._product_path!s}-saved")
-         if backup_path.exists():
-             rmtree(backup_path)
-         if self._product_path.exists():
-             os.rename(self._product_path, backup_path)
-         self._product_path.mkdir(parents=True)
-         self._install_path = self._product_path / "lib"
-         self._install_path.mkdir()
-         try:
-             if self._unsafe_install_version(version):
-                 logger.info(f"Successfully installed {self._product_name} v{version}")
-                 self._store_product_state(self._product_path, version)
-                 if backup_path.exists():
-                     rmtree(backup_path)
-                 return self._product_path
-         except (KeyError, ValueError) as e:
-             logger.error(f"Failed to install Databricks {self._product_name} transpiler v{version}", exc_info=e)
-             rmtree(self._product_path)
-             if backup_path.exists():
-                 os.rename(backup_path, self._product_path)
-         return None
-
-     def _unsafe_install_version(self, version: str) -> bool:
+     def _install_version(self, version: str) -> bool:
          jar_file_path = self._install_path / f"{self._artifact_id}.jar"
          if self._artifact:
-             logger.debug(f"Copying '{self._artifact!s}' to '{jar_file_path!s}'")
+             logger.debug(f"Copying: {self._artifact} -> {jar_file_path}")
              shutil.copyfile(self._artifact, jar_file_path)
          elif not self.download_artifact_from_maven(self._group_id, self._artifact_id, version, jar_file_path):
-             logger.error(f"Failed to install Databricks {self._product_name} transpiler v{version}")
+             logger.error(f"Failed to install Databricks {self._product_name} transpiler (v{version})")
              return False
          self._copy_lsp_config(jar_file_path)
          return True
@@ -449,6 +447,13 @@ class WorkspaceInstaller:
          logger.info("Installation completed successfully! Please refer to the documentation for the next steps.")
          return config

+     def has_installed_transpilers(self) -> bool:
+         """Detect whether there are transpilers currently installed."""
+         installed_transpilers = self._transpiler_repository.all_transpiler_names()
+         if installed_transpilers:
+             logger.info(f"Detected installed transpilers: {sorted(installed_transpilers)}")
+         return bool(installed_transpilers)
+
      def install_bladebridge(self, artifact: Path | None = None) -> None:
          local_name = "bladebridge"
          pypi_name = "databricks-bb-plugin"
@@ -802,3 +807,28 @@ class WorkspaceInstaller:

      def _has_necessary_access(self, catalog_name: str, schema_name: str, volume_name: str | None = None):
          self._resource_configurator.has_necessary_access(catalog_name, schema_name, volume_name)
+
+
+ def installer(ws: WorkspaceClient, transpiler_repository: TranspilerRepository) -> WorkspaceInstaller:
+     app_context = ApplicationContext(_verify_workspace_client(ws))
+     return WorkspaceInstaller(
+         app_context.workspace_client,
+         app_context.prompts,
+         app_context.installation,
+         app_context.install_state,
+         app_context.product_info,
+         app_context.resource_configurator,
+         app_context.workspace_installation,
+         transpiler_repository=transpiler_repository,
+     )
+
+
+ def _verify_workspace_client(ws: WorkspaceClient) -> WorkspaceClient:
+     """Verifies the workspace client configuration, ensuring it has the correct product info."""
+
+     # Using reflection to set right value for _product_info for telemetry
+     product_info = getattr(ws.config, '_product_info')
+     if product_info[0] != "lakebridge":
+         setattr(ws.config, '_product_info', ('lakebridge', __version__))
+
+     return ws
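Aside (editor's sketch, not from the package): one way a caller such as the Lakebridge CLI might consume the new module-level installer() factory together with has_installed_transpilers(). How the WorkspaceClient and TranspilerRepository instances are obtained is left as an assumption here; only calls shown elsewhere in this diff are used.

    from databricks.sdk import WorkspaceClient
    from databricks.labs.lakebridge.install import installer
    from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository


    def ensure_transpilers(ws: WorkspaceClient, repository: TranspilerRepository) -> None:
        # installer() wraps the client in an ApplicationContext and builds a WorkspaceInstaller.
        workspace_installer = installer(ws, repository)
        if not workspace_installer.has_installed_transpilers():
            workspace_installer.install_bladebridge()  # install_bladebridge() appears earlier in this diff
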
@@ -3,6 +3,7 @@ from functools import reduce
  from pyspark.sql import DataFrame, SparkSession
  from pyspark.sql.functions import col, expr, lit

+ from databricks.labs.lakebridge.reconcile.connectors.dialect_utils import DialectUtils
  from databricks.labs.lakebridge.reconcile.exception import ColumnMismatchException
  from databricks.labs.lakebridge.reconcile.recon_capture import (
      ReconIntermediatePersist,
@@ -22,7 +23,7 @@ _HASH_COLUMN_NAME = "hash_value_recon"
  _SAMPLE_ROWS = 50


- def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
+ def _raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
      error_msg = (
          f"{msg}\n"
          f"columns missing in source: {','.join(source_missing) if source_missing else None}\n"
@@ -33,12 +34,25 @@ def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_

  def _generate_join_condition(source_alias, target_alias, key_columns):
      conditions = [
-         col(f"{source_alias}.{key_column}").eqNullSafe(col(f"{target_alias}.{key_column}"))
+         col(f"{source_alias}.{DialectUtils.ansi_normalize_identifier(key_column)}").eqNullSafe(
+             col(f"{target_alias}.{DialectUtils.ansi_normalize_identifier(key_column)}")
+         )
          for key_column in key_columns
      ]
      return reduce(lambda a, b: a & b, conditions)


+ def _build_column_selector(table_name, column_name):
+     alias = DialectUtils.ansi_normalize_identifier(f"{table_name}_{DialectUtils.unnormalize_identifier(column_name)}")
+     return f'{table_name}.{DialectUtils.ansi_normalize_identifier(column_name)} as {alias}'
+
+
+ def _build_mismatch_column(table, column):
+     return col(DialectUtils.ansi_normalize_identifier(column)).alias(
+         DialectUtils.unnormalize_identifier(column.replace(f'{table}_', '').lower())
+     )
+
+
  def reconcile_data(
      source: DataFrame,
      target: DataFrame,
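Aside (editor's illustration, not part of the package diff): DialectUtils comes from the new reconcile/connectors/dialect_utils.py added in this release, whose implementation is not shown here. Assuming ansi_normalize_identifier wraps an identifier in ANSI double quotes and unnormalize_identifier strips that quoting, the selector built by _build_column_selector above would look roughly as follows; the two helpers below are simplified stand-ins for illustration, not the real DialectUtils:

    def ansi_normalize_identifier(identifier: str) -> str:
        return '"' + identifier.strip('"') + '"'  # stand-in: always ANSI double-quote

    def unnormalize_identifier(identifier: str) -> str:
        return identifier.strip('"')  # stand-in: drop any quoting

    def build_column_selector(table_name: str, column_name: str) -> str:
        alias = ansi_normalize_identifier(f"{table_name}_{unnormalize_identifier(column_name)}")
        return f"{table_name}.{ansi_normalize_identifier(column_name)} as {alias}"

    print(build_column_selector("source", '"Order Id"'))
    # source."Order Id" as "source_Order Id"
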
@@ -59,14 +73,14 @@ def reconcile_data(
              how="full",
          )
          .selectExpr(
-             *[f'{source_alias}.{col_name} as {source_alias}_{col_name}' for col_name in source.columns],
-             *[f'{target_alias}.{col_name} as {target_alias}_{col_name}' for col_name in target.columns],
+             *[f'{_build_column_selector(source_alias, col_name)}' for col_name in source.columns],
+             *[f'{_build_column_selector(target_alias, col_name)}' for col_name in target.columns],
          )
      )

      # Write unmatched df to volume
      df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(df)
-     logger.warning(f"Unmatched data is written to {path} successfully")
+     logger.warning(f"Unmatched data was written to {path} successfully")

      mismatch = _get_mismatch_data(df, source_alias, target_alias) if report_type in {"all", "data"} else None

@@ -74,24 +88,24 @@ def reconcile_data(
          df.filter(col(f"{source_alias}_{_HASH_COLUMN_NAME}").isNull())
          .select(
              *[
-                 col(col_name).alias(col_name.replace(f'{target_alias}_', '').lower())
+                 _build_mismatch_column(target_alias, col_name)
                  for col_name in df.columns
                  if col_name.startswith(f'{target_alias}_')
              ]
          )
-         .drop(_HASH_COLUMN_NAME)
+         .drop(f"{_HASH_COLUMN_NAME}")
      )

      missing_in_tgt = (
          df.filter(col(f"{target_alias}_{_HASH_COLUMN_NAME}").isNull())
          .select(
              *[
-                 col(col_name).alias(col_name.replace(f'{source_alias}_', '').lower())
+                 _build_mismatch_column(source_alias, col_name)
                  for col_name in df.columns
                  if col_name.startswith(f'{source_alias}_')
              ]
          )
-         .drop(_HASH_COLUMN_NAME)
+         .drop(f"{_HASH_COLUMN_NAME}")
      )
      mismatch_count = 0
      if mismatch:
@@ -123,23 +137,27 @@ def _get_mismatch_data(df: DataFrame, src_alias: str, tgt_alias: str) -> DataFra
          .filter(col("hash_match") == lit(False))
          .select(
              *[
-                 col(col_name).alias(col_name.replace(f'{src_alias}_', '').lower())
+                 _build_mismatch_column(src_alias, col_name)
                  for col_name in df.columns
                  if col_name.startswith(f'{src_alias}_')
              ]
          )
-         .drop(_HASH_COLUMN_NAME)
+         .drop(f"{_HASH_COLUMN_NAME}")
      )


- def _convert_columns_to_lowercase(df: DataFrame) -> DataFrame:
-     lowercased_columns = [col(column).alias(column.lower()) for column in df.columns]
-     return df.select(*lowercased_columns)
+ def _build_capture_df(df: DataFrame) -> DataFrame:
+     columns = [
+         col(DialectUtils.ansi_normalize_identifier(column)).alias(DialectUtils.unnormalize_identifier(column))
+         for column in df.columns
+     ]
+     return df.select(*columns)


  def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_columns: list[str]) -> MismatchOutput:
-     source_df = _convert_columns_to_lowercase(source)
-     target_df = _convert_columns_to_lowercase(target)
+     source_df = _build_capture_df(source)
+     target_df = _build_capture_df(target)
+     unnormalized_key_columns = [DialectUtils.unnormalize_identifier(column) for column in key_columns]

      source_columns = source_df.columns
      target_columns = target_df.columns
@@ -148,10 +166,10 @@ def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_
          message = "source and target should have same columns for capturing the mismatch data"
          source_missing = [column for column in target_columns if column not in source_columns]
          target_missing = [column for column in source_columns if column not in target_columns]
-         raise raise_column_mismatch_exception(message, source_missing, target_missing)
+         raise _raise_column_mismatch_exception(message, source_missing, target_missing)

-     check_columns = [column for column in source_columns if column not in key_columns]
-     mismatch_df = _get_mismatch_df(source_df, target_df, key_columns, check_columns)
+     check_columns = [column for column in source_columns if column not in unnormalized_key_columns]
+     mismatch_df = _get_mismatch_df(source_df, target_df, unnormalized_key_columns, check_columns)
      mismatch_columns = _get_mismatch_columns(mismatch_df, check_columns)
      return MismatchOutput(mismatch_df, mismatch_columns)

@@ -167,31 +185,50 @@ def _get_mismatch_columns(df: DataFrame, columns: list[str]):
      return mismatch_columns


+ def _normalize_mismatch_df_col(column, suffix):
+     unnormalized = DialectUtils.unnormalize_identifier(column) + suffix
+     return DialectUtils.ansi_normalize_identifier(unnormalized)
+
+
+ def _unnormalize_mismatch_df_col(column, suffix):
+     unnormalized = DialectUtils.unnormalize_identifier(column) + suffix
+     return unnormalized
+
+
  def _get_mismatch_df(source: DataFrame, target: DataFrame, key_columns: list[str], column_list: list[str]):
-     source_aliased = [col('base.' + column).alias(column + '_base') for column in column_list]
-     target_aliased = [col('compare.' + column).alias(column + '_compare') for column in column_list]
+     source_aliased = [
+         col('base.' + DialectUtils.ansi_normalize_identifier(column)).alias(
+             _unnormalize_mismatch_df_col(column, '_base')
+         )
+         for column in column_list
+     ]
+     target_aliased = [
+         col('compare.' + DialectUtils.ansi_normalize_identifier(column)).alias(
+             _unnormalize_mismatch_df_col(column, '_compare')
+         )
+         for column in column_list
+     ]

-     match_expr = [expr(f"{column}_base=={column}_compare").alias(column + "_match") for column in column_list]
-     key_cols = [col(column) for column in key_columns]
+     match_expr = [
+         expr(f"{_normalize_mismatch_df_col(column,'_base')}=={_normalize_mismatch_df_col(column,'_compare')}").alias(
+             _unnormalize_mismatch_df_col(column, '_match')
+         )
+         for column in column_list
+     ]
+     key_cols = [col(DialectUtils.ansi_normalize_identifier(column)) for column in key_columns]
      select_expr = key_cols + source_aliased + target_aliased + match_expr

-     filter_columns = " and ".join([column + "_match" for column in column_list])
-     filter_expr = ~expr(filter_columns)
-
      logger.info(f"KEY COLUMNS: {key_columns}")
-     logger.info(f"FILTER COLUMNS: {filter_expr}")
      logger.info(f"SELECT COLUMNS: {select_expr}")

      mismatch_df = (
          source.alias('base').join(other=target.alias('compare'), on=key_columns, how="inner").select(*select_expr)
      )

-     compare_columns = [column for column in mismatch_df.columns if column not in key_columns]
-     return mismatch_df.select(*key_columns + sorted(compare_columns))
-
-
- def alias_column_str(alias: str, columns: list[str]) -> list[str]:
-     return [f"{alias}.{column}" for column in columns]
+     compare_columns = [
+         DialectUtils.ansi_normalize_identifier(column) for column in mismatch_df.columns if column not in key_columns
+     ]
+     return mismatch_df.select(*key_cols + sorted(compare_columns))


  def _generate_agg_join_condition(source_alias: str, target_alias: str, key_columns: list[str]):