databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/labs/lakebridge/connections/credential_manager.py
@@ -0,0 +1,89 @@
+ from pathlib import Path
+ import logging
+ from typing import Protocol
+
+ import yaml
+
+ from databricks.labs.lakebridge.connections.env_getter import EnvGetter
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class SecretProvider(Protocol):
+     def get_secret(self, key: str) -> str:
+         pass
+
+
+ class LocalSecretProvider:
+     def get_secret(self, key: str) -> str:
+         return key
+
+
+ class EnvSecretProvider:
+     def __init__(self, env_getter: EnvGetter):
+         self._env_getter = env_getter
+
+     def get_secret(self, key: str) -> str:
+         try:
+             return self._env_getter.get(str(key))
+         except KeyError:
+             logger.debug(f"Environment variable {key} not found. Falling back to the literal value")
+             return key
+
+
+ class DatabricksSecretProvider:
+     def get_secret(self, key: str) -> str:
+         raise NotImplementedError("Databricks secret vault not implemented")
+
+
+ class CredentialManager:
+     def __init__(self, credential_loader: dict, secret_providers: dict):
+         self._credentials = credential_loader
+         self._secret_providers = secret_providers
+         self._default_vault = self._credentials.get('secret_vault_type', 'local').lower()
+
+     def get_credentials(self, source: str) -> dict:
+         if source not in self._credentials:
+             raise KeyError(f"Source system: {source} credentials not found")
+
+         value = self._credentials[source]
+         if not isinstance(value, dict):
+             raise KeyError(f"Invalid credential format for source: {source}")
+
+         return {k: self._get_secret_value(v) for k, v in value.items()}
+
+     def _get_secret_value(self, key: str) -> str:
+         provider = self._secret_providers.get(self._default_vault)
+         if not provider:
+             raise ValueError(f"Unsupported secret vault type: {self._default_vault}")
+         return provider.get_secret(key)
+
+
+ def _get_home() -> Path:
+     return Path(__file__).home()
+
+
+ def cred_file(product_name) -> Path:
+     return Path(f"{_get_home()}/.databricks/labs/{product_name}/.credentials.yml")
+
+
+ def _load_credentials(path: Path) -> dict:
+     try:
+         with open(path, encoding="utf-8") as f:
+             return yaml.safe_load(f)
+     except FileNotFoundError as e:
+         raise FileNotFoundError(f"Credentials file not found at {path}") from e
+
+
+ def create_credential_manager(product_name: str, env_getter: EnvGetter):
+     file_path = cred_file(product_name)
+
+     secret_providers = {
+         'local': LocalSecretProvider(),
+         'env': EnvSecretProvider(env_getter),
+         'databricks': DatabricksSecretProvider(),
+     }
+
+     loader = _load_credentials(file_path)
+     return CredentialManager(loader, secret_providers)
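For orientation, a minimal usage sketch of the credential manager above. The "remorph" product name and the YAML layout are illustrative assumptions, not taken from this diff:

    from databricks.labs.lakebridge.connections.credential_manager import create_credential_manager
    from databricks.labs.lakebridge.connections.env_getter import EnvGetter

    # Assumes ~/.databricks/labs/remorph/.credentials.yml containing, e.g.:
    #   secret_vault_type: env
    #   mssql:
    #     user: MSSQL_USER          # resolved from the environment by EnvSecretProvider
    #     password: MSSQL_PASSWORD
    manager = create_credential_manager("remorph", EnvGetter())
    creds = manager.get_credentials("mssql")  # -> {'user': ..., 'password': ...}

With `secret_vault_type: env`, each value in the source block is treated as an environment variable name and resolved via `EnvSecretProvider`; unset names fall back to the literal value.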
databricks/labs/lakebridge/connections/database_manager.py
@@ -0,0 +1,98 @@
+ import logging
+ from abc import ABC, abstractmethod
+ from typing import Any
+
+ from sqlalchemy import create_engine
+ from sqlalchemy.engine import Engine, Result, URL
+ from sqlalchemy.orm import sessionmaker
+ from sqlalchemy import text
+ from sqlalchemy.exc import OperationalError
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel("INFO")
+
+
+ class DatabaseConnector(ABC):
+     @abstractmethod
+     def _connect(self) -> Engine:
+         pass
+
+     @abstractmethod
+     def execute_query(self, query: str) -> Result[Any]:
+         pass
+
+
+ class _BaseConnector(DatabaseConnector):
+     def __init__(self, config: dict[str, Any]):
+         self.config = config
+         self.engine: Engine = self._connect()
+
+     def _connect(self) -> Engine:
+         raise NotImplementedError("Subclasses should implement this method")
+
+     def execute_query(self, query: str) -> Result[Any]:
+         if not self.engine:
+             raise ConnectionError("Not connected to the database.")
+         session = sessionmaker(bind=self.engine)
+         connection = session()
+         return connection.execute(text(query))
+
+
+ def _create_connector(db_type: str, config: dict[str, Any]) -> DatabaseConnector:
+     connectors = {
+         "snowflake": SnowflakeConnector,
+         "mssql": MSSQLConnector,
+         "tsql": MSSQLConnector,
+         "synapse": MSSQLConnector,
+     }
+
+     connector_class = connectors.get(db_type.lower())
+
+     if connector_class is None:
+         raise ValueError(f"Unsupported database type: {db_type}")
+
+     return connector_class(config)
+
+
+ class SnowflakeConnector(_BaseConnector):
+     def _connect(self) -> Engine:
+         raise NotImplementedError("Snowflake connector not implemented")
+
+
+ class MSSQLConnector(_BaseConnector):
+     def _connect(self) -> Engine:
+         query_params = {"driver": self.config['driver']}
+
+         for key, value in self.config.items():
+             if key not in ["user", "password", "server", "database", "port"]:
+                 query_params[key] = value
+         connection_string = URL.create(
+             "mssql+pyodbc",
+             username=self.config['user'],
+             password=self.config['password'],
+             host=self.config['server'],
+             port=self.config.get('port', 1433),
+             database=self.config['database'],
+             query=query_params,
+         )
+         return create_engine(connection_string)
+
+
+ class DatabaseManager:
+     def __init__(self, db_type: str, config: dict[str, Any]):
+         self.connector = _create_connector(db_type, config)
+
+     def execute_query(self, query: str) -> Result[Any]:
+         try:
+             return self.connector.execute_query(query)
+         except OperationalError:
+             logger.error("Error connecting to the database; check credentials")
+             raise ConnectionError("Error connecting to the database; check credentials") from None
+
+     def check_connection(self) -> bool:
+         query = "SELECT 101 AS test_column"
+         result = self.execute_query(query)
+         row = result.fetchone()
+         if row is None:
+             return False
+         return row[0] == 101
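A hedged sketch of driving the manager above against SQL Server. The connection values are placeholders, and the `mssql+pyodbc` URL additionally requires `pyodbc` and a matching ODBC driver on the host:

    from databricks.labs.lakebridge.connections.database_manager import DatabaseManager

    config = {
        "user": "sa",                               # placeholder credentials
        "password": "...",
        "server": "localhost",
        "database": "master",
        "driver": "ODBC Driver 18 for SQL Server",  # forwarded into the URL query string
    }
    db = DatabaseManager("mssql", config)           # "tsql" and "synapse" map to the same connector
    if db.check_connection():                       # runs SELECT 101 AS test_column
        print(db.execute_query("SELECT @@VERSION").fetchone())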
databricks/labs/lakebridge/connections/env_getter.py
@@ -0,0 +1,13 @@
+ import os
+
+
+ class EnvGetter:
+     """Standardised environment lookup, kept as a class to support testing; see debug_envgetter.py."""
+
+     def __init__(self):
+         self.env = dict(os.environ)
+
+     def get(self, key: str) -> str:
+         if key in self.env:
+             return self.env[key]
+         raise KeyError(f"not in env: {key}")
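A small sketch of how `EnvSecretProvider` (from credential_manager.py above) behaves on top of this getter; the variable names are illustrative:

    import os
    from databricks.labs.lakebridge.connections.credential_manager import EnvSecretProvider
    from databricks.labs.lakebridge.connections.env_getter import EnvGetter

    os.environ["MSSQL_USER"] = "sa"  # set before EnvGetter snapshots os.environ
    provider = EnvSecretProvider(EnvGetter())
    assert provider.get_secret("MSSQL_USER") == "sa"            # resolved from the environment
    assert provider.get_secret("plain-value") == "plain-value"  # KeyError -> falls back to the key itself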
databricks/labs/lakebridge/contexts/__init__.py
File without changes
databricks/labs/lakebridge/contexts/application.py
@@ -0,0 +1,133 @@
+ import logging
+ from functools import cached_property
+
+ from databricks.labs.blueprint.installation import Installation
+ from databricks.labs.blueprint.installer import InstallState
+ from databricks.labs.blueprint.upgrades import Upgrades
+ from databricks.labs.blueprint.tui import Prompts
+ from databricks.labs.blueprint.wheels import ProductInfo
+ from databricks.labs.lsql.backends import SqlBackend, StatementExecutionBackend
+ from databricks.sdk import WorkspaceClient
+ from databricks.sdk.config import Config
+ from databricks.sdk.errors import NotFound
+ from databricks.sdk.service.iam import User
+
+ from databricks.labs.lakebridge.config import TranspileConfig, ReconcileConfig, RemorphConfigs
+ from databricks.labs.lakebridge.deployment.configurator import ResourceConfigurator
+ from databricks.labs.lakebridge.deployment.dashboard import DashboardDeployment
+ from databricks.labs.lakebridge.deployment.installation import WorkspaceInstallation
+ from databricks.labs.lakebridge.deployment.recon import TableDeployment, JobDeployment, ReconDeployment
+ from databricks.labs.lakebridge.helpers.metastore import CatalogOperations
+
+ logger = logging.getLogger(__name__)
+
+
+ class ApplicationContext:
+     def __init__(self, ws: WorkspaceClient):
+         self._ws = ws
+
+     def replace(self, **kwargs):
+         """Replace cached properties for unit testing purposes."""
+         for key, value in kwargs.items():
+             self.__dict__[key] = value
+         return self
+
+     @cached_property
+     def workspace_client(self) -> WorkspaceClient:
+         return self._ws
+
+     @cached_property
+     def current_user(self) -> User:
+         return self.workspace_client.current_user.me()
+
+     @cached_property
+     def product_info(self) -> ProductInfo:
+         return ProductInfo.from_class(RemorphConfigs)
+
+     @cached_property
+     def installation(self) -> Installation:
+         return Installation.assume_user_home(self.workspace_client, self.product_info.product_name())
+
+     @cached_property
+     def transpile_config(self) -> TranspileConfig | None:
+         try:
+             return self.installation.load(TranspileConfig)
+         except NotFound as err:
+             logger.debug(f"Couldn't find existing `transpile` installation: {err}")
+             return None
+
+     @cached_property
+     def recon_config(self) -> ReconcileConfig | None:
+         try:
+             return self.installation.load(ReconcileConfig)
+         except NotFound as err:
+             logger.debug(f"Couldn't find existing `reconcile` installation: {err}")
+             return None
+
+     @cached_property
+     def remorph_config(self) -> RemorphConfigs:
+         return RemorphConfigs(transpile=self.transpile_config, reconcile=self.recon_config)
+
+     @cached_property
+     def connect_config(self) -> Config:
+         return self.workspace_client.config
+
+     @cached_property
+     def install_state(self) -> InstallState:
+         return InstallState.from_installation(self.installation)
+
+     @cached_property
+     def sql_backend(self) -> SqlBackend:
+         # The installer uses only StatementExecutionBackend, eliminating the need for Databricks Connect
+         return StatementExecutionBackend(self.workspace_client, self.connect_config.warehouse_id)
+
+     @cached_property
+     def catalog_operations(self) -> CatalogOperations:
+         return CatalogOperations(self.workspace_client)
+
+     @cached_property
+     def prompts(self) -> Prompts:
+         return Prompts()
+
+     @cached_property
+     def resource_configurator(self) -> ResourceConfigurator:
+         return ResourceConfigurator(self.workspace_client, self.prompts, self.catalog_operations)
+
+     @cached_property
+     def table_deployment(self) -> TableDeployment:
+         return TableDeployment(self.sql_backend)
+
+     @cached_property
+     def job_deployment(self) -> JobDeployment:
+         return JobDeployment(self.workspace_client, self.installation, self.install_state, self.product_info)
+
+     @cached_property
+     def dashboard_deployment(self) -> DashboardDeployment:
+         return DashboardDeployment(self.workspace_client, self.installation, self.install_state)
+
+     @cached_property
+     def recon_deployment(self) -> ReconDeployment:
+         return ReconDeployment(
+             self.workspace_client,
+             self.installation,
+             self.install_state,
+             self.product_info,
+             self.table_deployment,
+             self.job_deployment,
+             self.dashboard_deployment,
+         )
+
+     @cached_property
+     def workspace_installation(self) -> WorkspaceInstallation:
+         return WorkspaceInstallation(
+             self.workspace_client,
+             self.prompts,
+             self.installation,
+             self.recon_deployment,
+             self.product_info,
+             self.upgrades,
+         )
+
+     @cached_property
+     def upgrades(self):
+         return Upgrades(self.product_info, self.installation)
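A sketch of the testing hook that `replace()` provides. It assumes blueprint's `MockPrompts` helper, which other Databricks Labs projects use for the same purpose:

    from unittest.mock import create_autospec

    from databricks.sdk import WorkspaceClient
    from databricks.labs.blueprint.tui import MockPrompts
    from databricks.labs.lakebridge.contexts.application import ApplicationContext

    ws = create_autospec(WorkspaceClient)
    # replace() writes straight into __dict__, which is where cached_property
    # stores its value, so ctx.prompts now returns the mock instead of Prompts().
    ctx = ApplicationContext(ws).replace(prompts=MockPrompts({r".*": "yes"}))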
databricks/labs/lakebridge/coverage/__init__.py
File without changes
databricks/labs/lakebridge/coverage/commons.py
@@ -0,0 +1,223 @@
+ # pylint: disable=all
+ import collections
+ import dataclasses
+ import json
+ import logging
+ import os
+ import subprocess
+ import time
+ from collections.abc import Generator
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import TextIO
+
+ import sqlglot
+ from sqlglot.expressions import Expression
+ from sqlglot.dialects.dialect import Dialect
+ from sqlglot.dialects.databricks import Databricks
+ from sqlglot.errors import ErrorLevel
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclasses.dataclass
+ class ReportEntry:
+     project: str
+     commit_hash: str | None
+     version: str
+     timestamp: str
+     source_dialect: str
+     target_dialect: str
+     file: str
+     parsed: int = 0  # 1 for success, 0 for failure
+     statements: int = 0  # number of statements parsed
+     transpiled: int = 0  # 1 for success, 0 for failure
+     transpiled_statements: int = 0  # number of statements transpiled
+     failures: list[dict] = dataclasses.field(default_factory=list)
+
+
+ def sqlglot_run_coverage(dialect, subfolder):
+     input_dir = get_env_var("INPUT_DIR_PARENT", required=True)
+     output_dir = get_env_var("OUTPUT_DIR", required=True)
+     sqlglot_version = sqlglot.__version__
+     SQLGLOT_COMMIT_HASH = ""  # C0103 pylint
+
+     if not input_dir:
+         raise ValueError("Environment variable `INPUT_DIR_PARENT` is required")
+     if not output_dir:
+         raise ValueError("Environment variable `OUTPUT_DIR` is required")
+
+     collect_transpilation_stats(
+         "SQLGlot",
+         SQLGLOT_COMMIT_HASH,
+         sqlglot_version,
+         dialect,
+         Databricks,
+         Path(input_dir) / subfolder,
+         Path(output_dir),
+     )
+
+
+ def local_report(output_dir: Path):
+     reports = collections.defaultdict(list)
+     for file in output_dir.rglob("*.json"):
+         with file.open("r", encoding="utf8") as f:
+             for line in f:
+                 raw = json.loads(line)
+                 entry = ReportEntry(**raw)
+                 reports[(entry.project, entry.source_dialect)].append(entry)
+     for (project, dialect), entries in sorted(reports.items()):
+         total = len(entries)
+         parsed = sum(entry.parsed for entry in entries)
+         transpiled = sum(entry.transpiled for entry in entries)
+         parse_ratio = parsed / total
+         transpile_ratio = transpiled / total
+         print(
+             f"{project} -> {dialect}: {parse_ratio:.2%} parsed ({parsed}/{total}), "
+             f"{transpile_ratio:.2%} transpiled ({transpiled}/{total})"
+         )
+
+
+ def get_supported_sql_files(input_dir: Path) -> Generator[Path, None, None]:
+     yield from filter(lambda item: item.is_file() and item.suffix.lower() in [".sql", ".ddl"], input_dir.rglob("*"))
+
+
+ def write_json_line(file: TextIO, content: ReportEntry):
+     json.dump(dataclasses.asdict(content), file)
+     file.write("\n")
+
+
+ def get_env_var(env_var: str, *, required: bool = False) -> str | None:
+     """
+     Get the value of an environment variable.
+
+     :param env_var: The name of the environment variable to get the value of.
+     :param required: Indicates if the environment variable is required and raises a ValueError if it's not set.
+     :return: Returns the environment variable's value, or None if it's not set and not required.
+     """
+     value = os.getenv(env_var)
+     if value is None and required:
+         message = f"Environment variable {env_var} is not set"
+         raise ValueError(message)
+     return value
+
+
+ def get_current_commit_hash() -> str | None:
+     try:
+         return (
+             subprocess.check_output(
+                 ["/usr/bin/git", "rev-parse", "--short", "HEAD"],
+                 cwd=Path(__file__).resolve().parent,
+             )
+             .decode("ascii")
+             .strip()
+         )
+     except (subprocess.CalledProcessError, FileNotFoundError) as e:
+         logger.warning(f"Could not get the current commit hash. {e!s}")
+         return None
+
+
+ def get_current_time_utc() -> datetime:
+     return datetime.now(timezone.utc)
+
+
+ def parse_sql(sql: str, dialect: type[Dialect]) -> list[Expression]:
+     return [
+         expression for expression in sqlglot.parse(sql, read=dialect, error_level=ErrorLevel.IMMEDIATE) if expression
+     ]
+
+
+ def generate_sql(expressions: list[Expression], dialect: type[Dialect]) -> list[str]:
+     generator_dialect = Dialect.get_or_raise(dialect)
+     return [generator_dialect.generate(expression, copy=False) for expression in expressions if expression]
+
+
+ def _ensure_valid_io_paths(input_dir: Path, result_dir: Path):
+     if not input_dir.exists() or not input_dir.is_dir():
+         message = f"The input path {input_dir} doesn't exist or is not a directory"
+         raise NotADirectoryError(message)
+
+     if not result_dir.exists():
+         logger.info(f"Creating the output directory {result_dir}")
+         result_dir.mkdir(parents=True)
+     elif not result_dir.is_dir():
+         message = f"The output path {result_dir} exists but is not a directory"
+         raise NotADirectoryError(message)
+
+
+ def _get_report_file_path(
+     project: str,
+     source_dialect: type[Dialect],
+     target_dialect: type[Dialect],
+     result_dir: Path,
+ ) -> Path:
+     source_dialect_name = source_dialect.__name__
+     target_dialect_name = target_dialect.__name__
+     current_time_ns = time.time_ns()
+     return result_dir / f"{project}_{source_dialect_name}_{target_dialect_name}_{current_time_ns}.json".lower()
+
+
+ def _prepare_report_entry(
+     project: str,
+     commit_hash: str,
+     version: str,
+     source_dialect: type[Dialect],
+     target_dialect: type[Dialect],
+     file_path: str,
+     sql: str,
+ ) -> ReportEntry:
+     report_entry = ReportEntry(
+         project=project,
+         commit_hash=commit_hash,
+         version=version,
+         timestamp=get_current_time_utc().isoformat(),
+         source_dialect=source_dialect.__name__,
+         target_dialect=target_dialect.__name__,
+         file=file_path,
+     )
+     try:
+         expressions = parse_sql(sql, source_dialect)
+         report_entry.parsed = 1
+         report_entry.statements = len(expressions)
+     except Exception as pe:
+         report_entry.failures.append({'error_code': type(pe).__name__, 'error_message': repr(pe)})
+         return report_entry
+
+     try:
+         generated_sqls = generate_sql(expressions, target_dialect)
+         report_entry.transpiled = 1
+         report_entry.transpiled_statements = len([sql for sql in generated_sqls if sql.strip()])
+     except Exception as te:
+         report_entry.failures.append({'error_code': type(te).__name__, 'error_message': repr(te)})
+
+     return report_entry
+
+
+ def collect_transpilation_stats(
+     project: str,
+     commit_hash: str,
+     version: str,
+     source_dialect: type[Dialect],
+     target_dialect: type[Dialect],
+     input_dir: Path,
+     result_dir: Path,
+ ):
+     _ensure_valid_io_paths(input_dir, result_dir)
+     report_file_path = _get_report_file_path(project, source_dialect, target_dialect, result_dir)
+
+     with report_file_path.open("w", encoding="utf8") as report_file:
+         for input_file in get_supported_sql_files(input_dir):
+             with input_file.open("r", encoding="utf-8-sig") as file:
+                 sql = file.read()
+
+             file_path = str(input_file.absolute().relative_to(input_dir.parent.absolute()))
+             report_entry = _prepare_report_entry(
+                 project,
+                 commit_hash,
+                 version,
+                 source_dialect,
+                 target_dialect,
+                 file_path,
+                 sql,
+             )
+             write_json_line(report_file, report_entry)
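Tying the helpers above together, a sketch of a coverage run over a hypothetical input tree; the directory names are assumptions, and each .sql/.ddl file under the input directory yields one JSON line in a timestamped report under the output directory:

    from pathlib import Path

    import sqlglot
    from sqlglot.dialects.databricks import Databricks
    from sqlglot.dialects.tsql import TSQL

    from databricks.labs.lakebridge.coverage.commons import collect_transpilation_stats

    collect_transpilation_stats(
        "SQLGlot",            # project label recorded in each ReportEntry
        "",                   # commit hash unknown here
        sqlglot.__version__,
        TSQL,                 # source dialect class
        Databricks,           # target dialect class
        Path("queries") / "tsql",
        Path("reports"),
    )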
databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py
@@ -0,0 +1,29 @@
+ from pathlib import Path
+
+ from databricks.labs.blueprint.wheels import ProductInfo
+ from databricks.labs.lakebridge.coverage import commons
+ from databricks.labs.lakebridge.transpiler.sqlglot.generator.databricks import Databricks
+ from databricks.labs.lakebridge.transpiler.sqlglot.parsers.snowflake import Snowflake
+
+ if __name__ == "__main__":
+     input_dir = commons.get_env_var("INPUT_DIR_PARENT", required=True)
+     output_dir = commons.get_env_var("OUTPUT_DIR", required=True)
+
+     REMORPH_COMMIT_HASH = commons.get_current_commit_hash() or ""  # C0103 pylint
+     product_info = ProductInfo(__file__)
+     remorph_version = product_info.unreleased_version()
+
+     if not input_dir:
+         raise ValueError("Environment variable `INPUT_DIR_PARENT` is required")
+     if not output_dir:
+         raise ValueError("Environment variable `OUTPUT_DIR` is required")
+
+     commons.collect_transpilation_stats(
+         "Remorph",
+         REMORPH_COMMIT_HASH,
+         remorph_version,
+         Snowflake,
+         Databricks,
+         Path(input_dir) / 'snowflake',
+         Path(output_dir),
+     )
databricks/labs/lakebridge/coverage/local_report.py
@@ -0,0 +1,9 @@
+ from pathlib import Path
+
+ from databricks.labs.lakebridge.coverage import commons
+
+ if __name__ == "__main__":
+     output_dir = commons.get_env_var("OUTPUT_DIR", required=True)
+     if not output_dir:
+         raise ValueError("Environment variable `OUTPUT_DIR` is required")
+     commons.local_report(Path(output_dir))
databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py
@@ -0,0 +1,5 @@
+ from sqlglot.dialects.snowflake import Snowflake
+ from databricks.labs.lakebridge.coverage.commons import sqlglot_run_coverage
+
+ if __name__ == "__main__":
+     sqlglot_run_coverage(Snowflake, "snowflake")
databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py
@@ -0,0 +1,5 @@
+ from sqlglot.dialects.tsql import TSQL
+ from databricks.labs.lakebridge.coverage.commons import sqlglot_run_coverage
+
+ if __name__ == "__main__":
+     sqlglot_run_coverage(TSQL, "tsql")
databricks/labs/lakebridge/deployment/__init__.py
File without changes