databricks-labs-lakebridge 0.10.7__py3-none-any.whl → 0.10.8__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (30)
  1. databricks/labs/lakebridge/__about__.py +1 -1
  2. databricks/labs/lakebridge/assessments/profiler_validator.py +103 -0
  3. databricks/labs/lakebridge/base_install.py +1 -5
  4. databricks/labs/lakebridge/cli.py +13 -6
  5. databricks/labs/lakebridge/helpers/validation.py +5 -3
  6. databricks/labs/lakebridge/install.py +40 -481
  7. databricks/labs/lakebridge/reconcile/connectors/data_source.py +9 -5
  8. databricks/labs/lakebridge/reconcile/connectors/databricks.py +2 -1
  9. databricks/labs/lakebridge/reconcile/connectors/oracle.py +2 -1
  10. databricks/labs/lakebridge/reconcile/connectors/secrets.py +19 -1
  11. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +50 -29
  12. databricks/labs/lakebridge/reconcile/connectors/tsql.py +2 -1
  13. databricks/labs/lakebridge/reconcile/query_builder/base.py +50 -11
  14. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +8 -2
  15. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +7 -13
  16. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +18 -19
  17. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +36 -15
  18. databricks/labs/lakebridge/reconcile/recon_config.py +0 -15
  19. databricks/labs/lakebridge/reconcile/reconciliation.py +4 -1
  20. databricks/labs/lakebridge/reconcile/trigger_recon_aggregate_service.py +11 -31
  21. databricks/labs/lakebridge/reconcile/trigger_recon_service.py +4 -1
  22. databricks/labs/lakebridge/transpiler/execute.py +34 -28
  23. databricks/labs/lakebridge/transpiler/installers.py +523 -0
  24. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +2 -0
  25. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/METADATA +1 -1
  26. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/RECORD +30 -28
  27. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/WHEEL +0 -0
  28. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/entry_points.txt +0 -0
  29. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/LICENSE +0 -0
  30. {databricks_labs_lakebridge-0.10.7.dist-info → databricks_labs_lakebridge-0.10.8.dist-info}/licenses/NOTICE +0 -0
databricks/labs/lakebridge/__about__.py
@@ -1,2 +1,2 @@
  # DO NOT MODIFY THIS FILE
- __version__ = "0.10.7"
+ __version__ = "0.10.8"
databricks/labs/lakebridge/assessments/profiler_validator.py
@@ -0,0 +1,103 @@
+ import os
+ from dataclasses import dataclass
+ from duckdb import DuckDBPyConnection
+
+ from databricks.labs.lakebridge.assessments.pipeline import PipelineClass
+
+ PROFILER_DB_NAME = "profiler_extract.db"
+
+
+ @dataclass(frozen=True)
+ class ValidationOutcome:
+     """A data class that holds the outcome of a table validation check."""
+
+     table: str
+     column: str | None
+     strategy: str
+     outcome: str
+     severity: str
+
+
+ class ValidationStrategy:
+     """Abstract class for validating a Profiler table"""
+
+     def validate(self, connection: DuckDBPyConnection) -> ValidationOutcome:
+         raise NotImplementedError
+
+
+ class NullValidationCheck(ValidationStrategy):
+     """Concrete class for validating null values in a profiler table"""
+
+     def __init__(self, table, column, severity="WARN"):
+         self.name = self.__class__.__name__
+         self.table = table
+         self.column = column
+         self.severity = severity
+
+     def validate(self, connection: DuckDBPyConnection) -> ValidationOutcome:
+         """
+         Validates that a column does not contain null values.
+         input:
+             connection: a DuckDB connection object
+         """
+         result = connection.execute(f"SELECT COUNT(*) FROM {self.table} WHERE {self.column} IS NULL").fetchone()
+         if result:
+             row_count = result[0]
+             outcome = "FAIL" if row_count > 0 else "PASS"
+         else:
+             outcome = "FAIL"
+         return ValidationOutcome(self.table, self.column, self.name, outcome, self.severity)
+
+
+ class EmptyTableValidationCheck(ValidationStrategy):
+     """Concrete class for validating empty tables from a profiler run."""
+
+     def __init__(self, table, severity="WARN"):
+         self.name = self.__class__.__name__
+         self.table = table
+         self.severity = severity
+
+     def validate(self, connection) -> ValidationOutcome:
+         """Validates that a table is not empty.
+         input:
+             connection: a DuckDB connection object
+         returns:
+             a ValidationOutcome object
+         """
+         result = connection.execute(f"SELECT COUNT(*) FROM {self.table}").fetchone()
+         if result:
+             row_count = result[0]
+             outcome = "PASS" if row_count > 0 else "FAIL"
+         else:
+             outcome = "FAIL"
+         return ValidationOutcome(self.table, None, self.name, outcome, self.severity)
+
+
+ def get_profiler_extract_path(pipeline_config_path: str) -> str:
+     """
+     Returns the filesystem path of the profiler extract database.
+     input:
+         pipeline_config_path: the location of the pipeline definition .yml file
+     returns:
+         the filesystem path to the profiler extract database
+     """
+     pipeline_config = PipelineClass.load_config_from_yaml(pipeline_config_path)
+     normalized_db_path = os.path.normpath(pipeline_config.extract_folder)
+     database_path = f"{normalized_db_path}/{PROFILER_DB_NAME}"
+     return database_path
+
+
+ def build_validation_report(
+     validations: list[ValidationStrategy], connection: DuckDBPyConnection
+ ) -> list[ValidationOutcome]:
+     """
+     Builds a list of ValidationOutcomes from a list of validation checks.
+     input:
+         validations: a list of ValidationStrategy objects
+         connection: a DuckDB connection object
+     returns: a list of ValidationOutcomes
+     """
+     validation_report = []
+     for validation in validations:
+         validation_report.append(validation.validate(connection))
+     return validation_report
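
For context, a minimal sketch of how this new validation API might be wired together. The pipeline config path and the table/column names ("databases", "tables", "table_name") are hypothetical stand-ins, not names taken from the package:

import duckdb

from databricks.labs.lakebridge.assessments.profiler_validator import (
    EmptyTableValidationCheck,
    NullValidationCheck,
    build_validation_report,
    get_profiler_extract_path,
)

# Hypothetical pipeline definition and extract table names, for illustration only.
db_path = get_profiler_extract_path("pipeline_config.yml")
checks = [
    EmptyTableValidationCheck("databases", severity="ERROR"),
    NullValidationCheck("tables", "table_name"),
]
with duckdb.connect(db_path, read_only=True) as connection:
    for outcome in build_validation_report(checks, connection):
        print(f"{outcome.table}/{outcome.column}: {outcome.outcome} ({outcome.severity})")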
databricks/labs/lakebridge/base_install.py
@@ -19,11 +19,7 @@ def main() -> None:
          WorkspaceClient(product="lakebridge", product_version=__version__),
          transpiler_repository=TranspilerRepository.user_home(),
      )
-     if installer.has_installed_transpilers():
-         logger.warning(
-             "Detected existing Lakebridge transpilers; run 'databricks labs lakebridge install-transpile' to upgrade them."
-         )
-     else:
+     if not installer.upgrade_installed_transpilers():
          logger.debug("No existing Lakebridge transpilers detected; assuming fresh installation.")

      logger.info("Successfully Setup Lakebridge Components Locally")
databricks/labs/lakebridge/cli.py
@@ -74,6 +74,7 @@ def _remove_warehouse(ws: WorkspaceClient, warehouse_id: str):

  @lakebridge.command
  def transpile(
+     *,
      w: WorkspaceClient,
      transpiler_config_path: str | None = None,
      source_dialect: str | None = None,
@@ -340,6 +341,8 @@ class _TranspileConfigChecker:
                  supported_dialects = ", ".join(self._transpiler_repository.all_dialects())
                  msg = f"{msg_prefix}: {source_dialect!r} (supported dialects: {supported_dialects})"
                  raise_validation_exception(msg)
+             else:
+                 self._config = dataclasses.replace(self._config, source_dialect=source_dialect)
          else:
              # Check the source dialect against the engine.
              if source_dialect not in engine.supported_dialects:
@@ -366,6 +369,7 @@ class _TranspileConfigChecker:
          source_dialect = self._prompts.choice("Select the source dialect:", list(supported_dialects))
          engine = self._configure_transpiler_config_path(source_dialect)
          assert engine is not None, "No transpiler engine available for a supported dialect; configuration is invalid."
+         self._config = dataclasses.replace(self._config, source_dialect=source_dialect)
          return engine

      def _check_lsp_engine(self) -> TranspileEngine:
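
The two hunks above write the user's dialect choice back into the checker's config. The detail that makes the reassignment necessary, shown in a minimal sketch with a simplified stand-in for the real config class: dataclasses.replace does not mutate in place, it returns a new instance.

import dataclasses


@dataclasses.dataclass(frozen=True)
class TranspileConfig:  # simplified stand-in, not the package's actual class
    source_dialect: str | None = None

config = TranspileConfig()
dataclasses.replace(config, source_dialect="snowflake")  # result discarded: config unchanged
config = dataclasses.replace(config, source_dialect="snowflake")  # reassigned: takes effect
assert config.source_dialect == "snowflake"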
@@ -518,7 +522,7 @@ def _override_workspace_client_config(ctx: ApplicationContext, overrides: dict[s


  @lakebridge.command
- def reconcile(w: WorkspaceClient) -> None:
+ def reconcile(*, w: WorkspaceClient) -> None:
      """[EXPERIMENTAL] Reconciles source to Databricks datasets"""
      with_user_agent_extra("cmd", "execute-reconcile")
      ctx = ApplicationContext(w)
@@ -534,7 +538,7 @@ def reconcile(w: WorkspaceClient) -> None:


  @lakebridge.command
- def aggregates_reconcile(w: WorkspaceClient) -> None:
+ def aggregates_reconcile(*, w: WorkspaceClient) -> None:
      """[EXPERIMENTAL] Reconciles Aggregated source to Databricks datasets"""
      with_user_agent_extra("cmd", "execute-aggregates-reconcile")
      ctx = ApplicationContext(w)
@@ -552,8 +556,8 @@ def aggregates_reconcile(w: WorkspaceClient) -> None:

  @lakebridge.command
  def generate_lineage(
-     w: WorkspaceClient,
      *,
+     w: WorkspaceClient,
      source_dialect: str | None = None,
      input_source: str,
      output_folder: str,
@@ -578,7 +582,7 @@ def generate_lineage(


  @lakebridge.command
- def configure_secrets(w: WorkspaceClient) -> None:
+ def configure_secrets(*, w: WorkspaceClient) -> None:
      """Setup reconciliation connection profile details as Secrets on Databricks Workspace"""
      recon_conf = ReconConfigPrompts(w)

@@ -604,8 +608,9 @@ def configure_database_profiler() -> None:
      assessment.run()


- @lakebridge.command()
+ @lakebridge.command
  def install_transpile(
+     *,
      w: WorkspaceClient,
      artifact: str | None = None,
      transpiler_repository: TranspilerRepository = TranspilerRepository.user_home(),
@@ -622,6 +627,7 @@ def install_transpile(

  @lakebridge.command(is_unauthenticated=False)
  def configure_reconcile(
+     *,
      w: WorkspaceClient,
      transpiler_repository: TranspilerRepository = TranspilerRepository.user_home(),
  ) -> None:
@@ -637,8 +643,9 @@ def configure_reconcile(
      reconcile_installer.run(module="reconcile")


- @lakebridge.command()
+ @lakebridge.command
  def analyze(
+     *,
      w: WorkspaceClient,
      source_directory: str | None = None,
      report_file: str | None = None,
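
The recurring `*,` change across these command signatures makes every parameter keyword-only, so callers can no longer pass arguments positionally. A minimal sketch of the effect, using a simplified signature rather than the actual command:

def transpile(*, source_dialect: str | None = None) -> None:
    ...

transpile(source_dialect="snowflake")  # OK: passed by keyword
transpile("snowflake")                 # TypeError: takes 0 positional arguments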
databricks/labs/lakebridge/helpers/validation.py
@@ -37,19 +37,21 @@ class Validator:
              config.catalog_name,
              config.schema_name,
          )
+         # Some errors don't return the query text along with the error message, so those need to be handled separately.
+         static_errors_lkp = ["[UNRESOLVED_ROUTINE]", "[UNRESOLVED_COLUMN.WITHOUT_SUGGESTION]"]
          if is_valid:
              result = sql_text
              if exception_type is not None:
                  exception_msg = f"[{exception_type.upper()}]: {exception_msg}"
          else:
              query = ""
-             if "[UNRESOLVED_ROUTINE]" in str(exception_msg):
+             if any(err in str(exception_msg) for err in static_errors_lkp):
                  query = sql_text
              buffer = StringIO()
              buffer.write("-------------- Exception Start-------------------\n")
-             buffer.write("/* \n")
+             buffer.write("/*\n")
              buffer.write(str(exception_msg))
-             buffer.write("\n */ \n")
+             buffer.write("\n*/\n")
              buffer.write(query)
              buffer.write("\n ---------------Exception End --------------------\n")
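
With this change, the failing branch now includes the offending query for both error classes, wrapped in a SQL block comment. A standalone reproduction of just the formatting logic above, for illustration only (the surrounding Validator plumbing is omitted, and the error message is invented):

from io import StringIO

exception_msg = "[UNRESOLVED_COLUMN.WITHOUT_SUGGESTION] A column cannot be resolved."  # example message
sql_text = "SELECT unknown_col FROM t"
static_errors_lkp = ["[UNRESOLVED_ROUTINE]", "[UNRESOLVED_COLUMN.WITHOUT_SUGGESTION]"]

query = sql_text if any(err in exception_msg for err in static_errors_lkp) else ""
buffer = StringIO()
buffer.write("-------------- Exception Start-------------------\n")
buffer.write("/*\n")
buffer.write(exception_msg)
buffer.write("\n*/\n")
buffer.write(query)
buffer.write("\n ---------------Exception End --------------------\n")
print(buffer.getvalue())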