databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
databricks/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # DO NOT ADD ANYTHING ELSE TO THIS FILE FOR COMPATIBILITY WITH OTHER databricks.* PACKAGES
+ # SEE https://packaging.python.org/guides/packaging-namespace-packages/#pkgutil-style-namespace-packages
+ __path__ = __import__("pkgutil").extend_path(__path__, __name__)
databricks/labs/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # DO NOT ADD ANYTHING ELSE TO THIS FILE FOR COMPATIBILITY WITH OTHER databricks.* PACKAGES
+ # SEE https://packaging.python.org/guides/packaging-namespace-packages/#pkgutil-style-namespace-packages
+ __path__ = __import__("pkgutil").extend_path(__path__, __name__)
databricks/labs/lakebridge/__about__.py ADDED
@@ -0,0 +1,2 @@
+ # DO NOT MODIFY THIS FILE
+ __version__ = "0.10.0"
databricks/labs/lakebridge/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from databricks.sdk.core import with_user_agent_extra, with_product
+ from databricks.labs.blueprint.logger import install_logger
+ from databricks.labs.lakebridge.__about__ import __version__
+
+ install_logger()
+
+ # Add lakebridge/<version> for projects depending on lakebridge as a library
+ with_user_agent_extra("lakebridge", __version__)
+
+ # Add lakebridge/<version> for re-packaging of lakebridge, where product name is omitted
+ with_product("lakebridge", __version__)
databricks/labs/lakebridge/assessments/configure_assessment.py ADDED
@@ -0,0 +1,194 @@
+ from abc import ABC, abstractmethod
+ import logging
+ import shutil
+ import yaml
+
+ from databricks.labs.blueprint.tui import Prompts
+
+ from databricks.labs.lakebridge.connections.credential_manager import (
+     cred_file as creds,
+     CredentialManager,
+     create_credential_manager,
+ )
+ from databricks.labs.lakebridge.connections.database_manager import DatabaseManager
+ from databricks.labs.lakebridge.connections.env_getter import EnvGetter
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+ PROFILER_SOURCE_SYSTEM = ["mssql", "synapse"]
+
+
+ class AssessmentConfigurator(ABC):
+     """Abstract base class for assessment configuration."""
+
+     def __init__(self, product_name: str, prompts: Prompts, credential_file=None):
+         self.prompts = prompts
+         self._product_name = product_name
+         self._credential_file = creds(product_name) if not credential_file else credential_file
+
+     @abstractmethod
+     def _configure_credentials(self) -> str:
+         pass
+
+     @staticmethod
+     def _test_connection(source: str, cred_manager: CredentialManager):
+         config = cred_manager.get_credentials(source)
+
+         try:
+             db_manager = DatabaseManager(source, config)
+             if db_manager.check_connection():
+                 logger.info("Connection to the source system successful")
+             else:
+                 logger.error("Connection to the source system failed, check logs in debug mode")
+                 raise SystemExit("Connection validation failed. Exiting...")
+
+         except ConnectionError as e:
+             logger.error(f"Failed to connect to the source system: {e}")
+             raise SystemExit("Connection validation failed. Exiting...") from e
+
+     def run(self):
+         """Run the assessment configuration process."""
+         logger.info(f"Welcome to the {self._product_name} Assessment Configuration")
+         source = self._configure_credentials()
+         logger.info(f"{source.capitalize()} details and credentials received.")
+         if self.prompts.confirm(f"Do you want to test the connection to {source}?"):
+             cred_manager = create_credential_manager("lakebridge", EnvGetter())
+             if cred_manager:
+                 self._test_connection(source, cred_manager)
+         logger.info(f"{source.capitalize()} Assessment Configuration Completed")
+
+
+ class ConfigureSqlServerAssessment(AssessmentConfigurator):
+     """SQL Server specific assessment configuration."""
+
+     def _configure_credentials(self) -> str:
+         cred_file = self._credential_file
+         source = "mssql"
+
+         logger.info(
+             "\n(local | env) \nlocal means values are read as plain text \nenv means values are read "
+             "from environment variables, falling back to plain text if the variable is not found\n",
+         )
+         secret_vault_type = str(self.prompts.choice("Enter secret vault type (local | env)", ["local", "env"])).lower()
+         secret_vault_name = None
+
+         logger.info("Please refer to the documentation to understand the difference between local and env.")
+
+         credential = {
+             "secret_vault_type": secret_vault_type,
+             "secret_vault_name": secret_vault_name,
+             source: {
+                 "database": self.prompts.question("Enter the database name"),
+                 "driver": self.prompts.question("Enter the driver details"),
+                 "server": self.prompts.question("Enter the server or host details"),
+                 "port": int(self.prompts.question("Enter the port details", valid_number=True)),
+                 "user": self.prompts.question("Enter the user details"),
+                 "password": self.prompts.question("Enter the password details"),
+             },
+         }
+
+         if cred_file.exists():
+             backup_filename = cred_file.with_suffix('.bak')
+             shutil.copy(cred_file, backup_filename)
+             logger.debug(f"Backup of the existing file created at {backup_filename}")
+
+         with open(cred_file, 'w', encoding='utf-8') as file:
+             yaml.dump(credential, file, default_flow_style=False)
+
+         logger.info(f"Credential template created for {source}.")
+         return source
+
+
+ class ConfigureSynapseAssessment(AssessmentConfigurator):
+     """Synapse specific assessment configuration."""
+
+     def _configure_credentials(self) -> str:
+         cred_file = self._credential_file
+         source = "synapse"
+
+         logger.info(
+             "\n(local | env) \nlocal means values are read as plain text \nenv means values are read "
+             "from environment variables, falling back to plain text if the variable is not found\n",
+         )
+         secret_vault_type = str(self.prompts.choice("Enter secret vault type (local | env)", ["local", "env"])).lower()
+         secret_vault_name = None
+
+         # Synapse Workspace Settings
+         logger.info("Please provide Synapse Workspace settings:")
+         synapse_workspace = {
+             "name": self.prompts.question("Enter Synapse workspace name"),
+             "dedicated_sql_endpoint": self.prompts.question("Enter dedicated SQL endpoint"),
+             "serverless_sql_endpoint": self.prompts.question("Enter serverless SQL endpoint"),
+             "sql_user": self.prompts.question("Enter SQL user"),
+             "sql_password": self.prompts.question("Enter SQL password"),
+             "tz_info": self.prompts.question("Enter timezone (e.g. America/New_York)", default="UTC"),
+         }
+
+         # Azure API Access Settings
+         logger.info("Please provide Azure API access settings:")
+         azure_api_access = {
+             "development_endpoint": self.prompts.question("Enter development endpoint"),
+             "azure_client_id": self.prompts.question("Enter Azure client ID"),
+             "azure_tenant_id": self.prompts.question("Enter Azure tenant ID"),
+             "azure_client_secret": self.prompts.question("Enter Azure client secret"),
+         }
+
+         # JDBC Settings
+         logger.info("Please select JDBC authentication type:")
+         auth_type = self.prompts.choice(
+             "Select authentication type", ["sql_authentication", "ad_passwd_authentication", "spn_authentication"]
+         )
+
+         synapse_jdbc = {
+             "auth_type": auth_type,
+             "fetch_size": self.prompts.question("Enter fetch size", default="1000"),
+             "login_timeout": self.prompts.question("Enter login timeout (seconds)", default="30"),
+         }
+
+         # Profiler Settings
+         logger.info("Please configure profiler settings:")
+         synapse_profiler = {
+             "exclude_serverless_sql_pool": self.prompts.confirm("Exclude serverless SQL pool from profiling?"),
+             "exclude_dedicated_sql_pools": self.prompts.confirm("Exclude dedicated SQL pools from profiling?"),
+             "exclude_spark_pools": self.prompts.confirm("Exclude Spark pools from profiling?"),
+             "exclude_monitoring_metrics": self.prompts.confirm("Exclude monitoring metrics from profiling?"),
+             "redact_sql_pools_sql_text": self.prompts.confirm("Redact SQL pools SQL text?"),
+         }
+
+         credential = {
+             "secret_vault_type": secret_vault_type,
+             "secret_vault_name": secret_vault_name,
+             source: {
+                 "workspace": synapse_workspace,
+                 "azure_api_access": azure_api_access,
+                 "jdbc": synapse_jdbc,
+                 "profiler": synapse_profiler,
+             },
+         }
+
+         if cred_file.exists():
+             backup_filename = cred_file.with_suffix('.bak')
+             shutil.copy(cred_file, backup_filename)
+             logger.debug(f"Backup of the existing file created at {backup_filename}")
+
+         with open(cred_file, 'w', encoding='utf-8') as file:
+             yaml.dump(credential, file, default_flow_style=False)
+
+         logger.info(f"Credential template created for {source}.")
+         return source
+
+
+ def create_assessment_configurator(
+     source_system: str, product_name: str, prompts: Prompts, credential_file=None
+ ) -> AssessmentConfigurator:
+     """Factory function to create the appropriate assessment configurator."""
+     configurators = {
+         "mssql": ConfigureSqlServerAssessment,
+         "synapse": ConfigureSynapseAssessment,
+     }
+
+     if source_system not in configurators:
+         raise ValueError(f"Unsupported source system: {source_system}")
+
+     return configurators[source_system](product_name, prompts, credential_file)
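Usage sketch (illustrative, not part of the wheel contents): a minimal example of driving the factory above from Python, assuming the interactive Prompts object from databricks.labs.blueprint.tui can be constructed directly; "mssql" and "synapse" are the only source systems the factory accepts.

    from databricks.labs.blueprint.tui import Prompts
    from databricks.labs.lakebridge.assessments.configure_assessment import create_assessment_configurator

    # Build the SQL Server configurator and run the interactive flow: it prompts
    # for connection details, writes the credentials YAML (backing up any existing
    # file), and optionally tests the connection to the source system.
    configurator = create_assessment_configurator(
        source_system="mssql",
        product_name="lakebridge",
        prompts=Prompts(),
    )
    configurator.run()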
databricks/labs/lakebridge/assessments/pipeline.py ADDED
@@ -0,0 +1,188 @@
+ from pathlib import Path
+ from subprocess import run, CalledProcessError
+ from dataclasses import dataclass
+ from enum import Enum
+
+ import venv
+ import tempfile
+ import json
+ import logging
+ import yaml
+ import duckdb
+
+ from databricks.labs.lakebridge.connections.credential_manager import cred_file
+
+ from databricks.labs.lakebridge.assessments.profiler_config import PipelineConfig, Step
+ from databricks.labs.lakebridge.connections.database_manager import DatabaseManager
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel("INFO")
+
+ DB_NAME = "profiler_extract.db"
+
+
+ class StepExecutionStatus(str, Enum):
+     COMPLETE = "COMPLETE"
+     ERROR = "ERROR"
+     SKIPPED = "SKIPPED"
+
+
+ @dataclass
+ class StepExecutionResult:
+     step_name: str
+     status: StepExecutionStatus
+     error_message: str | None = None
+
+
+ class PipelineClass:
+     def __init__(self, config: PipelineConfig, executor: DatabaseManager):
+         self.config = config
+         self.executor = executor
+         self.db_path_prefix = Path(config.extract_folder)
+
+     def execute(self) -> list[StepExecutionResult]:
+         logging.info(f"Pipeline initialized with config: {self.config.name}, version: {self.config.version}")
+         execution_results: list[StepExecutionResult] = []
+         for step in self.config.steps:
+             result = self._process_step(step)
+             execution_results.append(result)
+             logging.info(f"Step '{step.name}' completed with status: {result.status}")
+
+         logging.info("Pipeline execution completed")
+         return execution_results
+
+     def _process_step(self, step: Step) -> StepExecutionResult:
+         if step.flag != "active":
+             logging.info(f"Skipping step: {step.name} as it is not active")
+             return StepExecutionResult(step_name=step.name, status=StepExecutionStatus.SKIPPED)
+
+         logging.debug(f"Executing step: {step.name}")
+         try:
+             status = self._execute_step(step)
+             return StepExecutionResult(step_name=step.name, status=status)
+         except RuntimeError as e:
+             return StepExecutionResult(step_name=step.name, status=StepExecutionStatus.ERROR, error_message=str(e))
+
+     def _execute_step(self, step: Step) -> StepExecutionStatus:
+         if step.type == "sql":
+             logging.info(f"Executing SQL step {step.name}")
+             self._execute_sql_step(step)
+             return StepExecutionStatus.COMPLETE
+         if step.type == "python":
+             logging.info(f"Executing Python step {step.name}")
+             self._execute_python_step(step)
+             return StepExecutionStatus.COMPLETE
+         logging.error(f"Unsupported step type: {step.type}")
+         raise RuntimeError(f"Unsupported step type: {step.type}")
+
+     def _execute_sql_step(self, step: Step):
+         logging.debug(f"Reading query from file: {step.extract_source}")
+         with open(step.extract_source, 'r', encoding='utf-8') as file:
+             query = file.read()
+
+         # Execute the query using the database manager
+         logging.info(f"Executing query: {query}")
+         try:
+             result = self.executor.execute_query(query)
+
+             # Save the result to duckdb
+             self._save_to_db(result, step.name, str(step.mode))
+         except Exception as e:
+             logging.error(f"SQL execution failed: {str(e)}")
+             raise RuntimeError(f"SQL execution failed: {str(e)}") from e
+
+     def _execute_python_step(self, step: Step):
+
+         logging.debug(f"Executing Python script: {step.extract_source}")
+         db_path = str(self.db_path_prefix / DB_NAME)
+         credential_config = str(cred_file("lakebridge"))
+
+         # Create a temporary directory for the virtual environment
+         with tempfile.TemporaryDirectory() as temp_dir:
+             venv_dir = Path(temp_dir) / "venv"
+             venv.create(venv_dir, with_pip=True)
+             venv_python = venv_dir / "bin" / "python"
+             venv_pip = venv_dir / "bin" / "pip"
+
+             logger.info(f"Creating a virtual environment for Python script execution: {venv_dir}")
+             # Install dependencies in the virtual environment
+             if step.dependencies:
+                 logging.info(f"Installing dependencies: {', '.join(step.dependencies)}")
+                 try:
+                     logging.debug("Upgrading local pip")
+                     run([str(venv_pip), "install", "--upgrade", "pip"], check=True, capture_output=True, text=True)
+
+                     run([str(venv_pip), "install", *step.dependencies], check=True, capture_output=True, text=True)
+                 except CalledProcessError as e:
+                     logging.error(f"Failed to install dependencies: {e.stderr}")
+                     raise RuntimeError(f"Failed to install dependencies: {e.stderr}") from e
+
+             # Execute the Python script using the virtual environment's Python interpreter
+             try:
+                 result = run(
+                     [
+                         str(venv_python),
+                         str(step.extract_source),
+                         "--db-path",
+                         db_path,
+                         "--credential-config-path",
+                         credential_config,
+                     ],
+                     check=True,
+                     capture_output=True,
+                     text=True,
+                 )
+
+                 try:
+                     output = json.loads(result.stdout)
+                     if output["status"] == "success":
+                         logging.info(f"Python script completed: {output['message']}")
+                     else:
+                         raise RuntimeError(f"Script reported error: {output['message']}")
+                 except json.JSONDecodeError:
+                     logging.info(f"Python script output: {result.stdout}")
+
+             except CalledProcessError as e:
+                 error_msg = e.stderr
+                 logging.error(f"Python script failed: {error_msg}")
+                 raise RuntimeError(f"Script execution failed: {error_msg}") from e
+
+     def _save_to_db(self, result, step_name: str, mode: str, batch_size: int = 1000):
+         self._create_dir(self.db_path_prefix)
+         db_path = str(self.db_path_prefix / DB_NAME)
+
+         with duckdb.connect(db_path) as conn:
+             columns = result.keys()
+             # TODO: Add support for figuring out data types from the SQLAlchemy result object; result.cursor.description is not reliable
+             schema = ' STRING, '.join(columns) + ' STRING'
+
+             # Handle write modes
+             if mode == 'overwrite':
+                 conn.execute(f"CREATE OR REPLACE TABLE {step_name} ({schema})")
+             elif mode == 'append' and step_name not in conn.get_table_names(""):
+                 conn.execute(f"CREATE TABLE {step_name} ({schema})")
+
+             # Batch insert using prepared statements
+             placeholders = ', '.join(['?' for _ in columns])
+             insert_query = f"INSERT INTO {step_name} VALUES ({placeholders})"
+
+             # Fetch and insert rows in batches
+             while True:
+                 rows = result.fetchmany(batch_size)
+                 if not rows:
+                     break
+                 conn.executemany(insert_query, rows)
+
+     @staticmethod
+     def _create_dir(dir_path: Path):
+         if not Path(dir_path).exists():
+             dir_path.mkdir(parents=True, exist_ok=True)
+
+     @staticmethod
+     def load_config_from_yaml(file_path: str) -> PipelineConfig:
+         with open(file_path, 'r', encoding='utf-8') as file:
+             data = yaml.safe_load(file)
+         steps = [Step(**step) for step in data['steps']]
+         return PipelineConfig(
+             name=data['name'], version=data['version'], extract_folder=data['extract_folder'], steps=steps
+         )
databricks/labs/lakebridge/assessments/profiler_config.py ADDED
@@ -0,0 +1,30 @@
+ from dataclasses import dataclass, field
+
+
+ @dataclass
+ class Step:
+     name: str
+     type: str | None
+     extract_source: str
+     mode: str | None
+     frequency: str | None
+     flag: str | None
+     dependencies: list[str] = field(default_factory=list)
+     comment: str | None = None
+
+     def __post_init__(self):
+         if self.frequency is None:
+             self.frequency = "once"
+         if self.flag is None:
+             self.flag = "active"
+         if self.mode is None:
+             self.mode = "append"
+
+
+ @dataclass
+ class PipelineConfig:
+     name: str
+     version: str
+     extract_folder: str
+     comment: str | None = None
+     steps: list[Step] = field(default_factory=list)
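Usage sketch (illustrative, not part of the wheel contents): constructing the dataclasses above directly, showing how __post_init__ fills in defaults when the optional fields are passed as None; the names and paths below are hypothetical.

    from databricks.labs.lakebridge.assessments.profiler_config import PipelineConfig, Step

    step = Step(
        name="extract_tables",              # hypothetical step name
        type="sql",
        extract_source="queries/tables.sql",  # hypothetical query file
        mode=None,                          # becomes "append"
        frequency=None,                     # becomes "once"
        flag=None,                          # becomes "active"
    )
    config = PipelineConfig(name="mssql_profiler", version="1.0", extract_folder="/tmp/extracts", steps=[step])
    assert step.mode == "append" and step.flag == "active" and step.frequency == "once"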
databricks/labs/lakebridge/base_install.py ADDED
@@ -0,0 +1,12 @@
+ from databricks.labs.blueprint.logger import install_logger
+ from databricks.labs.blueprint.entrypoint import get_logger
+ from databricks.sdk.core import with_user_agent_extra
+
+ install_logger()
+ with_user_agent_extra("cmd", "install")
+
+ if __name__ == "__main__":
+     logger = get_logger(__file__)
+     logger.setLevel("INFO")
+
+     logger.info("Successfully Setup Remorph Components Locally")