databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/labs/lakebridge/deployment/recon.py
@@ -0,0 +1,145 @@
+ import logging
+ from importlib.resources import files
+
+ from databricks.labs.blueprint.installation import Installation
+ from databricks.labs.blueprint.installer import InstallState
+ from databricks.labs.blueprint.wheels import ProductInfo
+ from databricks.labs.blueprint.wheels import find_project_root
+ from databricks.sdk import WorkspaceClient
+ from databricks.sdk.errors import InvalidParameterValue, NotFound
+
+ import databricks.labs.lakebridge.resources
+ from databricks.labs.lakebridge.config import ReconcileConfig
+ from databricks.labs.lakebridge.deployment.dashboard import DashboardDeployment
+ from databricks.labs.lakebridge.deployment.job import JobDeployment
+ from databricks.labs.lakebridge.deployment.table import TableDeployment
+
+ logger = logging.getLogger(__name__)
+
+ _RECON_PREFIX = "Reconciliation"
+ RECON_JOB_NAME = f"{_RECON_PREFIX} Runner"
+
+
+ class ReconDeployment:
+     def __init__(
+         self,
+         ws: WorkspaceClient,
+         installation: Installation,
+         install_state: InstallState,
+         product_info: ProductInfo,
+         table_deployer: TableDeployment,
+         job_deployer: JobDeployment,
+         dashboard_deployer: DashboardDeployment,
+     ):
+         self._ws = ws
+         self._installation = installation
+         self._install_state = install_state
+         self._product_info = product_info
+         self._table_deployer = table_deployer
+         self._job_deployer = job_deployer
+         self._dashboard_deployer = dashboard_deployer
+
+     def install(self, recon_config: ReconcileConfig | None, wheel_paths: list[str]):
+         if not recon_config:
+             logger.warning("Recon Config is empty.")
+             return
+         logger.info("Installing reconcile components.")
+         self._deploy_tables(recon_config)
+         self._deploy_dashboards(recon_config)
+         remorph_wheel_path = [whl for whl in wheel_paths if "remorph" in whl][0]
+         self._deploy_jobs(recon_config, remorph_wheel_path)
+         self._install_state.save()
+         logger.info("Installation of reconcile components completed successfully.")
+
+     def uninstall(self, recon_config: ReconcileConfig | None):
+         if not recon_config:
+             return
+         logger.info("Uninstalling reconcile components.")
+         self._remove_dashboards()
+         self._remove_jobs()
+         logger.info(
+             f"Won't remove reconcile metadata schema `{recon_config.metadata_config.schema}` "
+             f"from catalog `{recon_config.metadata_config.catalog}`. Please remove it and the tables inside manually."
+         )
+         logger.info(
+             f"Won't remove configured reconcile secret scope `{recon_config.secret_scope}`. "
+             f"Please remove it manually."
+         )
+
+     def _deploy_tables(self, recon_config: ReconcileConfig):
+         logger.info("Deploying reconciliation metadata tables.")
+         catalog = recon_config.metadata_config.catalog
+         schema = recon_config.metadata_config.schema
+         resources = files(databricks.labs.lakebridge.resources)
+         query_dir = resources.joinpath("reconcile/queries/installation")
+
+         sqls_to_deploy = [
+             "main.sql",
+             "metrics.sql",
+             "details.sql",
+             "aggregate_metrics.sql",
+             "aggregate_details.sql",
+             "aggregate_rules.sql",
+         ]
+
+         for sql_file in sqls_to_deploy:
+             table_sql_file = query_dir.joinpath(sql_file)
+             self._table_deployer.deploy_table_from_ddl_file(catalog, schema, sql_file.removesuffix(".sql"), table_sql_file)
+
+     def _deploy_dashboards(self, recon_config: ReconcileConfig):
+         logger.info("Deploying reconciliation dashboards.")
+         dashboard_base_dir = (
+             find_project_root(__file__) / "src/databricks/labs/lakebridge/resources/reconcile/dashboards"
+         )
+         self._dashboard_deployer.deploy(dashboard_base_dir, recon_config)
+
+     def _get_dashboards(self) -> list[tuple[str, str]]:
+         return list(self._install_state.dashboards.items())
+
+     def _remove_dashboards(self):
+         logger.info("Removing reconciliation dashboards.")
+         for dashboard_ref, dashboard_id in self._get_dashboards():
+             try:
+                 logger.info(f"Removing dashboard with id={dashboard_id}.")
+                 del self._install_state.dashboards[dashboard_ref]
+                 self._ws.lakeview.trash(dashboard_id)
+             except (InvalidParameterValue, NotFound):
+                 logger.warning(f"Dashboard with id={dashboard_id} doesn't exist anymore for some reason.")
+                 continue
+
+     def _deploy_jobs(self, recon_config: ReconcileConfig, remorph_wheel_path: str):
+         logger.info("Deploying reconciliation jobs.")
+         self._job_deployer.deploy_recon_job(RECON_JOB_NAME, recon_config, remorph_wheel_path)
+         for job_name, job_id in self._get_deprecated_jobs():
+             try:
+                 logger.info(f"Removing job_id={job_id}, as it is no longer needed.")
+                 del self._install_state.jobs[job_name]
+                 self._ws.jobs.delete(job_id)
+             except (InvalidParameterValue, NotFound):
+                 logger.warning(f"{job_name} doesn't exist anymore for some reason.")
+                 continue
+
+     def _get_jobs(self) -> list[tuple[str, int]]:
+         return [
+             (job_name, int(job_id))
+             for job_name, job_id in self._install_state.jobs.items()
+             if job_name.startswith(_RECON_PREFIX)
+         ]
+
+     def _get_deprecated_jobs(self) -> list[tuple[str, int]]:
+         return [
+             (job_name, int(job_id))
+             for job_name, job_id in self._install_state.jobs.items()
+             if job_name.startswith(_RECON_PREFIX) and job_name != RECON_JOB_NAME
+         ]
+
+     def _remove_jobs(self):
+         logger.info("Removing Reconciliation Jobs.")
+         for job_name, job_id in self._get_jobs():
+             try:
+                 logger.info(f"Removing job {job_name} with job_id={job_id}.")
+                 del self._install_state.jobs[job_name]
+                 self._ws.jobs.delete(int(job_id))
+             except (InvalidParameterValue, NotFound):
+                 logger.warning(f"{job_name} doesn't exist anymore for some reason.")
+                 continue
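A minimal sketch of how ReconDeployment might be wired up and exercised. The dependency construction below (ProductInfo.from_class, Installation.assume_user_home, InstallState.from_installation) and the placeholder wheel path are illustrative assumptions, not the installer's actual wiring:

    from databricks.sdk import WorkspaceClient
    from databricks.labs.blueprint.installation import Installation
    from databricks.labs.blueprint.installer import InstallState
    from databricks.labs.blueprint.wheels import ProductInfo

    ws = WorkspaceClient()
    product_info = ProductInfo.from_class(ReconDeployment)
    installation = Installation.assume_user_home(ws, product_info.product_name())
    install_state = InstallState.from_installation(installation)

    # table_deployer, job_deployer and dashboard_deployer are assumed to be built
    # elsewhere (see deployment/table.py, job.py and dashboard.py in this wheel).
    recon = ReconDeployment(
        ws, installation, install_state, product_info,
        table_deployer, job_deployer, dashboard_deployer,
    )

    # recon_config is a ReconcileConfig loaded from the installed configuration;
    # install() picks the wheel whose path contains "remorph".
    recon.install(recon_config, wheel_paths=["/Workspace/.lakebridge/wheels/remorph-0.0.0-py3-none-any.whl"])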
databricks/labs/lakebridge/deployment/table.py
@@ -0,0 +1,30 @@
+ import logging
+ from importlib.abc import Traversable
+
+ from databricks.labs.lsql.backends import SqlBackend
+
+ logger = logging.getLogger(__name__)
+
+
+ class TableDeployment:
+     def __init__(self, sql_backend: SqlBackend):
+         self._sql_backend = sql_backend
+
+     def deploy_table_from_ddl_file(
+         self,
+         catalog: str,
+         schema: str,
+         table_name: str,
+         ddl_query_filepath: Traversable,
+     ):
+         """
+         Deploys a table to the given catalog and schema
+         :param catalog: The table catalog
+         :param schema: The table schema
+         :param table_name: The table to deploy
+         :param ddl_query_filepath: DDL file path
+         """
+         query = ddl_query_filepath.read_text()
+         logger.info(f"Deploying table {table_name} in {catalog}.{schema}")
+         logger.info(f"SQL Backend used for deploying table: {type(self._sql_backend).__name__}")
+         self._sql_backend.execute(query, catalog=catalog, schema=schema)
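As a hedged usage sketch, the bundled installation DDL can be fed straight into deploy_table_from_ddl_file; the warehouse id, catalog and schema below are placeholders, and the StatementExecutionBackend choice simply mirrors helpers/db_sql.py later in this diff:

    from importlib.resources import files

    from databricks.labs.lsql.backends import StatementExecutionBackend
    from databricks.sdk import WorkspaceClient

    import databricks.labs.lakebridge.resources

    ws = WorkspaceClient()
    backend = StatementExecutionBackend(ws, "1234567890abcdef")  # placeholder warehouse id
    deployer = TableDeployment(backend)

    # Deploy the "main" reconcile metadata table from the packaged DDL.
    query_dir = files(databricks.labs.lakebridge.resources).joinpath("reconcile/queries/installation")
    deployer.deploy_table_from_ddl_file("my_catalog", "reconcile_metadata", "main", query_dir.joinpath("main.sql"))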
databricks/labs/lakebridge/deployment/upgrade_common.py
@@ -0,0 +1,124 @@
+ import logging
+ import re
+ from importlib.resources import files
+
+ import databricks.labs.lakebridge.resources
+
+ from databricks.labs.blueprint.tui import Prompts
+ from databricks.sdk import WorkspaceClient
+ from databricks.labs.lakebridge.helpers import db_sql
+
+ logger = logging.getLogger(__name__)
+
+
+ def replace_patterns(sql_text: str) -> str:
+     """
+     Replace the STRUCT and MAP datatypes in the SQL text with empty string
+     """
+     # Pattern to match nested STRUCT and MAP datatypes
+     pattern = r'(STRUCT<[^<>]*?(?:<[^<>]*?>[^<>]*?)*>|MAP<[^<>]*?(?:<[^<>]*?>[^<>]*?)*>)'
+     parsed_sql_text = re.sub(pattern, "", sql_text, flags=re.DOTALL)
+     return parsed_sql_text
+
+
+ def extract_columns_with_datatype(sql_text: str) -> list[str]:
+     """
+     Extract the columns with datatype from the SQL text
+     Example:
+         Input: CREATE TABLE main (
+                    recon_table_id BIGINT NOT NULL,
+                    report_type STRING NOT NULL
+                );
+         Output: [recon_table_id BIGINT NOT NULL,
+                  report_type STRING NOT NULL]
+     """
+     return sql_text[sql_text.index("(") + 1 : sql_text.index(")")].strip().split(",")
+
+
+ def extract_column_name(column_with_datatype: str) -> str:
+     """
+     Extract the column name from the column with datatype.
+     Example:
+         Input: \n    recon_table_id BIGINT NOT NULL,
+         Output: recon_table_id
+     """
+     return column_with_datatype.strip("\n").strip().split(" ")[0]
+
+
+ def table_original_query(table_name: str, full_table_name: str) -> str:
+     """
+     Get the main table DDL from the main.sql file
+     :return: str
+     """
+     resources = files(databricks.labs.lakebridge.resources)
+     query_dir = resources.joinpath("reconcile/queries/installation")
+     return (
+         query_dir.joinpath(f"{table_name}.sql")
+         .read_text()
+         .replace(f"CREATE TABLE IF NOT EXISTS {table_name}", f"CREATE OR REPLACE TABLE {full_table_name}")
+     )
+
+
+ def current_table_columns(table_name: str, full_table_name: str) -> list[str]:
+     """
+     Extract the column names from the main table DDL
+     :return: column_names: list[str]
+     """
+     main_sql = replace_patterns(table_original_query(table_name, full_table_name))
+     main_table_columns = [
+         extract_column_name(main_table_column) for main_table_column in extract_columns_with_datatype(main_sql)
+     ]
+     return main_table_columns
+
+
+ def installed_table_columns(ws: WorkspaceClient, table_identifier: str) -> list[str]:
+     """
+     Fetch the column names from the installed table on Databricks Workspace using SQL Backend
+     :return: column_names: list[str]
+     """
+     main_table_columns = list(db_sql.get_sql_backend(ws).fetch(f"DESC {table_identifier}"))
+     return [row.col_name for row in main_table_columns]
+
+
+ def check_table_mismatch(
+     installed_table,
+     current_table,
+ ) -> bool:
+     # Compare the current main table columns with the installed main table columns
+     if len(installed_table) != len(current_table) or sorted(installed_table) != sorted(current_table):
+         return True
+     return False
+
+
+ def recreate_table_sql(
+     table_identifier: str,
+     installed_table: list[str],
+     current_table: list[str],
+     prompts: Prompts,
+ ) -> str | None:
+     """
+     * Verify all the current main table columns are present in the installed main table and then use CTAS to recreate the main table
+     * If any of the current main table columns are missing in the installed main table, prompt the user to recreate the main table:
+       - If the user confirms, recreate the main table using the main DDL file, else log an error message and exit
+     :param table_identifier:
+     :param installed_table:
+     :param current_table:
+     :param prompts:
+     :return:
+     """
+     table_name = table_identifier.split('.')[-1]
+     sql: str | None = (
+         f"CREATE OR REPLACE TABLE {table_identifier} AS SELECT {','.join(current_table)} FROM {table_identifier}"
+     )
+
+     if not set(current_table).issubset(installed_table):
+         if prompts.confirm(
+             f"The `{table_identifier}` table columns are not as expected. Do you want to recreate the `{table_identifier}` table?"
+         ):
+             sql = table_original_query(table_name, table_identifier)
+         else:
+             logger.error(
+                 f"The `{table_identifier}` table columns are not as expected. Please check and recreate the `{table_identifier}` table."
+             )
+             sql = None
+     return sql
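Taken together, these helpers implement a schema-drift check used by the upgrade scripts. A hedged sketch of the flow, with placeholder catalog, schema and table names:

    from databricks.labs.blueprint.tui import Prompts
    from databricks.sdk import WorkspaceClient

    ws = WorkspaceClient()
    prompts = Prompts()
    table_identifier = "my_catalog.reconcile_metadata.main"  # placeholder

    installed = installed_table_columns(ws, table_identifier)
    current = current_table_columns("main", table_identifier)

    if check_table_mismatch(installed, current):
        sql = recreate_table_sql(table_identifier, installed, current, prompts)
        if sql:
            # Run the generated CTAS / recreate statement with any SqlBackend.
            db_sql.get_sql_backend(ws).execute(sql)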
databricks/labs/lakebridge/discovery/table.py
@@ -0,0 +1,36 @@
+ from dataclasses import dataclass, field
+ from typing import Any
+
+
+ @dataclass
+ class TableFQN:
+     catalog: str | None
+     schema: str
+     name: str
+
+     @property
+     def fqn(self) -> str:
+         if self.catalog:
+             return f"{self.catalog}.{self.schema}.{self.name}"
+         return f"{self.schema}.{self.name}"
+
+
+ @dataclass
+ class FieldInfo:
+     name: str
+     data_type: str
+     nullable: bool | None = None
+     metadata: dict[str, Any] | None = None
+     comment: str | None = None
+
+
+ @dataclass
+ class TableDefinition:
+     fqn: TableFQN
+     location: str | None = None
+     table_format: str | None = None
+     view_text: str | None = None
+     columns: list[FieldInfo] = field(default_factory=list)
+     primary_keys: list[str] | None = None
+     size_gb: int | None = None
+     comment: str | None = None
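For illustration, a definition for a hypothetical dbo.customers table can be assembled directly from these dataclasses:

    fqn = TableFQN(catalog="sales_db", schema="dbo", name="customers")
    columns = [
        FieldInfo(name="customer_id", data_type="bigint", nullable=False),
        FieldInfo(name="email", data_type="varchar(255)", nullable=True, comment="primary contact"),
    ]
    table = TableDefinition(fqn=fqn, columns=columns, primary_keys=["customer_id"], size_gb=2)
    print(table.fqn.fqn)  # sales_db.dbo.customers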
databricks/labs/lakebridge/discovery/table_definition.py
@@ -0,0 +1,23 @@
+ from abc import ABC, abstractmethod
+ from collections.abc import Iterable
+ from typing import Any
+
+ from databricks.labs.lakebridge.discovery.table import TableDefinition
+
+
+ class TableDefinitionService(ABC):
+
+     def __init__(self, connection: Any):
+         self.connection = connection
+
+     @abstractmethod
+     def get_table_definition(self, catalog_name: str) -> Iterable[TableDefinition]:
+         pass
+
+     @abstractmethod
+     def _get_table_definition_query(self, catalog_name: str) -> str:
+         pass
+
+     @abstractmethod
+     def get_all_catalog(self) -> Iterable[str]:
+         pass
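A concrete service only has to supply the three abstract methods; the TsqlTableDefinitionService that follows is the real implementation, so this stub merely sketches the contract:

    class StubTableDefinitionService(TableDefinitionService):
        def _get_table_definition_query(self, catalog_name: str) -> str:
            # Dialect-specific metadata query for the given catalog.
            return f"SELECT table_schema, table_name FROM {catalog_name}.information_schema.tables"

        def get_table_definition(self, catalog_name: str) -> Iterable[TableDefinition]:
            # Run the query via self.connection and map rows to TableDefinition objects.
            return []

        def get_all_catalog(self) -> Iterable[str]:
            return ["catalog_a", "catalog_b"]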
databricks/labs/lakebridge/discovery/tsql_table_definition.py
@@ -0,0 +1,185 @@
+ from collections.abc import Iterable
+
+ from databricks.labs.lakebridge.connections.database_manager import DatabaseManager
+ from databricks.labs.lakebridge.discovery.table import TableDefinition, TableFQN, FieldInfo
+ from databricks.labs.lakebridge.discovery.table_definition import TableDefinitionService
+
+
+ class TsqlTableDefinitionService(TableDefinitionService):
+
+     # Hexadecimal value of § is U+00A7. Hexadecimal value of ‡ (double dagger) is U+2021
+     def _get_table_definition_query(self, catalog_name: str) -> str:
+         query = f"""
+         WITH column_info AS (
+             SELECT
+                 TABLE_CATALOG,
+                 TABLE_SCHEMA,
+                 TABLE_NAME,
+                 STRING_AGG(
+                     CONCAT(
+                         column_name,
+                         '§',
+                         CASE
+                             WHEN numeric_precision IS NOT NULL AND numeric_scale IS NOT NULL THEN CONCAT(data_type, '(', numeric_precision, ',', numeric_scale, ')')
+                             WHEN LOWER(data_type) = 'text' THEN CONCAT('varchar', '(', CHARACTER_MAXIMUM_LENGTH, ')')
+                             ELSE data_type
+                         END,
+                         '§',
+                         CASE
+                             WHEN cis.IS_NULLABLE = 'YES' THEN 'true'
+                             ELSE 'false'
+                         END,
+                         '§',
+                         ISNULL(CAST(ep_col.value AS NVARCHAR(MAX)), '')
+                     ),
+                     '‡'
+                 ) WITHIN GROUP (ORDER BY ordinal_position) AS DERIVED_SCHEMA
+             FROM
+                 {catalog_name}.sys.tables t
+                 INNER JOIN {catalog_name}.sys.columns c ON t.object_id = c.object_id
+                 INNER JOIN {catalog_name}.INFORMATION_SCHEMA.COLUMNS cis ON t.name = cis.TABLE_NAME AND c.name = cis.COLUMN_NAME
+                 OUTER APPLY (
+                     SELECT TOP 1 value
+                     FROM {catalog_name}.sys.extended_properties
+                     WHERE major_id = t.object_id AND minor_id = 0
+                     ORDER BY name DESC
+                 ) ep_tbl
+                 OUTER APPLY (
+                     SELECT TOP 1 value
+                     FROM {catalog_name}.sys.extended_properties
+                     WHERE major_id = c.object_id AND minor_id = c.column_id
+                     ORDER BY name DESC
+                 ) ep_col
+             GROUP BY
+                 TABLE_CATALOG,
+                 TABLE_SCHEMA,
+                 TABLE_NAME
+         ),
+         table_file_info AS (
+             SELECT
+                 s.name AS TABLE_SCHEMA,
+                 t.name AS TABLE_NAME,
+                 f.physical_name AS location,
+                 f.type_desc AS TABLE_FORMAT,
+                 CAST(ROUND(SUM(a.used_pages) * 8.0 / 1024, 2) AS DECIMAL(18, 2)) AS SIZE_GB
+             FROM
+                 {catalog_name}.sys.tables t
+                 INNER JOIN {catalog_name}.sys.indexes i ON t.object_id = i.object_id
+                 INNER JOIN {catalog_name}.sys.partitions p ON i.object_id = p.object_id AND i.index_id = p.index_id
+                 INNER JOIN {catalog_name}.sys.allocation_units a ON p.partition_id = a.container_id
+                 INNER JOIN {catalog_name}.sys.schemas s ON t.schema_id = s.schema_id
+                 INNER JOIN {catalog_name}.sys.database_files f ON a.data_space_id = f.data_space_id
+                 LEFT JOIN {catalog_name}.sys.extended_properties ep ON ep.major_id = t.object_id AND ep.minor_id = 0
+             GROUP BY
+                 s.name,
+                 t.name,
+                 f.name,
+                 f.physical_name,
+                 f.type_desc
+         ),
+         table_comment_info AS (
+             SELECT
+                 s.name AS TABLE_SCHEMA,
+                 t.name AS TABLE_NAME,
+                 CAST(ep.value AS NVARCHAR(MAX)) AS TABLE_COMMENT
+             FROM
+                 {catalog_name}.sys.tables t
+                 INNER JOIN {catalog_name}.sys.schemas s ON t.schema_id = s.schema_id
+                 OUTER APPLY (
+                     SELECT TOP 1 value
+                     FROM {catalog_name}.sys.extended_properties
+                     WHERE major_id = t.object_id AND minor_id = 0
+                     ORDER BY name DESC
+                 ) ep
+         ),
+         table_pk_info AS (
+             SELECT
+                 TC.TABLE_CATALOG,
+                 TC.TABLE_SCHEMA,
+                 TC.TABLE_NAME,
+                 STRING_AGG(KU.COLUMN_NAME,':') as PK_COLUMN_NAME
+             FROM {catalog_name}.INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS TC
+             JOIN {catalog_name}.INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS KU
+                 ON TC.CONSTRAINT_NAME = KU.CONSTRAINT_NAME
+                 AND TC.TABLE_NAME = KU.TABLE_NAME
+             WHERE TC.CONSTRAINT_TYPE = 'PRIMARY KEY' group by TC.TABLE_CATALOG, TC.TABLE_SCHEMA, TC.TABLE_NAME)
+         SELECT
+             sft.TABLE_CATALOG,
+             sft.TABLE_SCHEMA,
+             sft.TABLE_NAME,
+             tfi.location,
+             tfi.TABLE_FORMAT,
+             '' as view_definition,
+             column_info.DERIVED_SCHEMA,
+             tfi.SIZE_GB,
+             tci.TABLE_COMMENT,
+             tpK.PK_COLUMN_NAME
+         FROM
+             column_info
+             JOIN {catalog_name}.INFORMATION_SCHEMA.TABLES sft ON column_info.TABLE_CATALOG = sft.TABLE_CATALOG AND column_info.TABLE_SCHEMA = sft.TABLE_SCHEMA AND column_info.TABLE_NAME = sft.TABLE_NAME
+             LEFT JOIN table_file_info tfi ON column_info.TABLE_SCHEMA = tfi.TABLE_SCHEMA AND column_info.TABLE_NAME = tfi.TABLE_NAME
+             LEFT JOIN table_comment_info tci ON column_info.TABLE_SCHEMA = tci.TABLE_SCHEMA AND column_info.TABLE_NAME = tci.TABLE_NAME
+             LEFT JOIN table_pk_info tpK ON column_info.TABLE_SCHEMA = tpK.TABLE_SCHEMA AND column_info.TABLE_NAME = tpK.TABLE_NAME
+
+         UNION ALL
+         SELECT
+             sfv.TABLE_CATALOG,
+             sfv.TABLE_SCHEMA,
+             sfv.TABLE_NAME,
+             '' location,
+             '' TABLE_FORMAT,
+             sfv.view_definition,
+             '' DERIVED_SCHEMA,
+             0 SIZE_GB,
+             '' TABLE_COMMENT,
+             '' PK_COLUMN_NAME
+         FROM {catalog_name}.INFORMATION_SCHEMA.VIEWS sfv
+         """
+         return query
+
+     def get_table_definition(self, catalog_name: str) -> Iterable[TableDefinition]:
+         sql = self._get_table_definition_query(catalog_name)
+         tsql_connection = self.connection
+         result = tsql_connection.execute_query(sql)
+
+         column_names = list(result.keys())
+         table_definitions = []
+
+         for row in result:
+             result = dict(zip(column_names, row))
+             table_fqn = TableFQN(
+                 catalog=result["TABLE_CATALOG"], schema=result["TABLE_SCHEMA"], name=result["TABLE_NAME"]
+             )
+             columns = result["DERIVED_SCHEMA"].split("‡") if result["DERIVED_SCHEMA"] else None
+             field_info = []
+             if columns is not None:
+                 for column in columns:
+                     column_info = column.split("§")
+                     field = FieldInfo(
+                         name=column_info[0],
+                         data_type=column_info[1],
+                         nullable=column_info[2],
+                         comment=column_info[3],
+                     )
+                     field_info.append(field)
+
+             pks = result["PK_COLUMN_NAME"].split(":") if result["PK_COLUMN_NAME"] else None
+             table_definition = TableDefinition(
+                 fqn=table_fqn,
+                 location=result["location"],
+                 table_format=result["TABLE_FORMAT"],
+                 view_text=result["view_definition"],
+                 columns=field_info,
+                 size_gb=result["SIZE_GB"],
+                 comment=result["TABLE_COMMENT"],
+                 primary_keys=pks,
+             )
+             table_definitions.append(table_definition)
+         return table_definitions
+
+     def get_all_catalog(self) -> Iterable[str]:
+         cursor: DatabaseManager = self.connection
+         result = cursor.connector.execute_query("""select name from sys.databases""")
+         catalogs = [row[0] for row in result]
+         print(catalogs)
+         return catalogs
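A hedged sketch of driving the service end to end; db_manager stands for an already-connected DatabaseManager (see connections/database_manager.py), since its construction is not shown here:

    # db_manager: DatabaseManager connected to the source SQL Server instance.
    service = TsqlTableDefinitionService(connection=db_manager)

    for catalog in service.get_all_catalog():
        for table in service.get_table_definition(catalog):
            print(table.fqn.fqn, table.size_gb, [column.name for column in table.columns])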
databricks/labs/lakebridge/errors/exceptions.py
@@ -0,0 +1 @@
+ class IllegalStateException(BaseException): ...
databricks/labs/lakebridge/helpers/__init__.py (file without changes)
databricks/labs/lakebridge/helpers/db_sql.py
@@ -0,0 +1,24 @@
+ import logging
+ import os
+
+ from databricks.labs.lsql.backends import (
+     DatabricksConnectBackend,
+     RuntimeBackend,
+     SqlBackend,
+     StatementExecutionBackend,
+ )
+ from databricks.sdk import WorkspaceClient
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_sql_backend(ws: WorkspaceClient, warehouse_id: str | None = None) -> SqlBackend:
+     warehouse_id = warehouse_id or ws.config.warehouse_id
+     if warehouse_id:
+         logger.info(f"Using SQL backend with warehouse_id: {warehouse_id}")
+         return StatementExecutionBackend(ws, warehouse_id)
+     if "DATABRICKS_RUNTIME_VERSION" in os.environ:
+         logger.info("Using SQL backend with Databricks Runtime.")
+         return RuntimeBackend()
+     logger.info("Using SQL backend with Databricks Connect.")
+     return DatabricksConnectBackend(ws)
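The selection order is: an explicit or configured warehouse first, then the Databricks runtime, then Databricks Connect. A small sketch with a placeholder warehouse id:

    from databricks.sdk import WorkspaceClient

    ws = WorkspaceClient()

    # Explicit warehouse id -> StatementExecutionBackend.
    backend = get_sql_backend(ws, warehouse_id="1234567890abcdef")
    rows = list(backend.fetch("SELECT 1 AS ok"))

    # Without a warehouse id, the function falls back to RuntimeBackend inside a
    # Databricks job or notebook, and to DatabricksConnectBackend everywhere else.
    default_backend = get_sql_backend(ws)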
databricks/labs/lakebridge/helpers/execution_time.py
@@ -0,0 +1,20 @@
+ import inspect
+ import logging
+ import time
+ from functools import wraps
+
+ logger = logging.getLogger(__name__)
+
+
+ def timeit(func):
+     @wraps(func)
+     def timeit_wrapper(*args, **kwargs):
+         start_time = time.perf_counter()
+         result = func(*args, **kwargs)
+         end_time = time.perf_counter()
+         total_time = end_time - start_time
+         name = inspect.getmodule(func).__name__.split(".")[3].capitalize()
+         logger.info(f"{name} took {total_time:.4f} seconds")
+         return result
+
+     return timeit_wrapper
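Because the wrapper takes the log label from the fourth segment of the decorated function's module name, it assumes the function is defined under a databricks.labs.lakebridge.<component> module; a hedged sketch of what that looks like:

    # If this were defined in databricks/labs/lakebridge/transpiler/execute.py,
    # the label logged below would be "Transpiler".
    @timeit
    def transpile_all(paths: list[str]) -> int:
        return len(paths)

    transpile_all(["query1.sql", "query2.sql"])  # logs e.g. "Transpiler took 0.0001 seconds"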
databricks/labs/lakebridge/helpers/file_utils.py
@@ -0,0 +1,64 @@
+ from pathlib import Path
+ from collections.abc import Generator
+
+
+ def is_sql_file(file: str | Path) -> bool:
+     """
+     Checks if the given file is a SQL file.
+
+     :param file: The name of the file to check.
+     :return: True if the file is a SQL file (i.e., its extension is either .sql or .ddl), False otherwise.
+     """
+     file_extension = Path(file).suffix
+     return file_extension.lower() in {".sql", ".ddl"}
+
+
+ def is_dbt_project_file(file: Path):
+     # it's ok to hardcode the file name here, see https://docs.getdbt.com/reference/dbt_project.yml
+     return file.name == "dbt_project.yml"
+
+
+ def make_dir(path: str | Path) -> None:
+     """
+     Creates a directory at the specified path if it does not already exist.
+
+     :param path: The path where the directory should be created.
+     """
+     Path(path).mkdir(parents=True, exist_ok=True)
+
+
+ def dir_walk(root: Path):
+     """
+     Walks the directory tree rooted at the given path, yielding a tuple containing the root directory, a list of
+     subdirectories, and a list of files.
+     :param root: Path
+     :return: tuple of root, subdirectories, files
+     """
+     sub_dirs = [d for d in root.iterdir() if d.is_dir()]
+     files = [f for f in root.iterdir() if f.is_file()]
+     yield root, sub_dirs, files
+
+     for each_dir in sub_dirs:
+         yield from dir_walk(each_dir)
+
+
+ def get_sql_file(input_path: str | Path) -> Generator[Path, None, None]:
+     """
+     Returns a generator that yields the paths of all SQL files under the given directory.
+     :param input_path: Path
+     :return: Generator of SQL file paths
+     """
+     for _, _, files in dir_walk(Path(input_path)):
+         for filename in files:
+             if is_sql_file(filename):
+                 yield filename
+
+
+ def read_file(filename: str | Path) -> str:
+     """
+     Reads the contents of the given file and returns it as a string.
+     :param filename: Input File Path
+     :return: File Contents as String
+     """
+     # pylint: disable=unspecified-encoding
+     with Path(filename).open() as file:
+         return file.read()
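A short example tying the helpers together to read every SQL or DDL file under a directory (the path is a placeholder):

    from pathlib import Path

    input_dir = Path("/tmp/queries")  # placeholder directory
    make_dir(input_dir)

    for sql_path in get_sql_file(input_dir):
        print(sql_path, len(read_file(sql_path)))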