databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
@@ -0,0 +1,88 @@
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ class Node:
+     def __init__(self, name: str):
+         self.name = name.lower()
+         self.children: list[str] = []
+         self.parents: list[str] = []
+
+     def add_parent(self, node: str) -> None:
+         self.parents.append(node)
+
+     def add_child(self, node: str) -> None:
+         self.children.append(node)
+
+     def __repr__(self) -> str:
+         return f"Node({self.name}, {self.children})"
+
+
+ class DAG:
+     def __init__(self):
+         self.nodes: dict[str, Node] = {}
+
+     def add_node(self, node_name: str) -> None:
+         if node_name not in self.nodes and node_name not in {None, "none"}:
+             self.nodes[node_name.lower()] = Node(node_name.lower())
+
+     def add_edge(self, parent_name: str, child_name: str) -> None:
+         parent_name = parent_name.lower() if parent_name is not None else None
+         child_name = child_name.lower() if child_name is not None else None
+         logger.debug(f"Adding edge: {parent_name} -> {child_name}")
+         if parent_name not in self.nodes:
+             self.add_node(parent_name)
+         if child_name not in self.nodes:
+             self.add_node(child_name)
+
+         if child_name is not None:
+             self.nodes[parent_name].add_child(child_name)
+             self.nodes[child_name].add_parent(parent_name)
+
+     def identify_immediate_parents(self, table_name: str) -> list[str]:
+         table_name = table_name.lower()  # convert to lower() case
+         if table_name not in self.nodes:
+             logger.debug(f"Table with the name {table_name} not found in the DAG")
+             return []
+
+         return list(self.nodes[table_name].parents)
+
+     def identify_immediate_children(self, table_name: str) -> list[str]:
+         table_name = table_name.lower()  # convert to lower() case
+         if table_name not in self.nodes:
+             logger.debug(f"Table with the name {table_name} not found in the DAG")
+             return []
+
+         return list(self.nodes[table_name].children)
+
+     def _is_root_node(self, node_name: str) -> bool:
+         return len(self.identify_immediate_parents(node_name)) == 0
+
+     def walk_bfs(self, node: Node, level: int) -> set:
+         tables_at_level = set()
+         queue = [(node, 0)]  # The queue for the BFS. Each element is a tuple (node, level).
+         while queue:
+             current_node, node_level = queue.pop(0)
+
+             if node_level == level:
+                 tables_at_level.add(current_node.name)
+             elif node_level > level:
+                 break
+
+             for child_name in self.identify_immediate_children(current_node.name):
+                 queue.append((self.nodes[child_name], node_level + 1))
+         return tables_at_level
+
+     def identify_root_tables(self, level: int) -> set:
+         all_nodes = set(self.nodes.values())
+         root_tables_at_level = set()
+
+         for node in all_nodes:
+             if self._is_root_node(node.name):
+                 root_tables_at_level.update(self.walk_bfs(node, level))
+
+         return root_tables_at_level
+
+     def __repr__(self) -> str:
+         return str({node_name: str(node) for node_name, node in self.nodes.items()})
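`DAG` stores every table name lowercased and links each child table back to the tables it was derived from; `identify_root_tables(level)` walks breadth-first from the parentless nodes. A minimal usage sketch, assuming the package is installed (the table names are made up):

    from databricks.labs.lakebridge.intermediate.dag import DAG

    dag = DAG()
    # add_edge(parent, child): the child table is derived from the parent table
    dag.add_edge("raw.orders", "staging.orders")
    dag.add_edge("staging.orders", "mart.orders_daily")

    print(dag.identify_immediate_parents("mart.orders_daily"))  # ['staging.orders']
    print(dag.identify_root_tables(0))                          # {'raw.orders'}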
@@ -0,0 +1,44 @@
+ import logging
+ from pathlib import Path
+
+ from databricks.labs.lakebridge.helpers.file_utils import (
+     get_sql_file,
+     is_sql_file,
+     read_file,
+ )
+ from databricks.labs.lakebridge.intermediate.dag import DAG
+
+ from databricks.labs.lakebridge.transpiler.sqlglot.sqlglot_engine import SqlglotEngine
+
+ logger = logging.getLogger(__name__)
+
+
+ class RootTableAnalyzer:
+
+     def __init__(self, engine: SqlglotEngine, source_dialect: str, input_path: Path):
+         self.engine = engine
+         self.source_dialect = source_dialect
+         self.input_path = input_path
+
+     def generate_lineage_dag(self) -> DAG:
+         dag = DAG()
+
+         # when input is sql file then parse the file
+         if is_sql_file(self.input_path):
+             logger.debug(f"Generating Lineage file: {self.input_path}")
+             sql_content = read_file(self.input_path)
+             self._populate_dag(sql_content, self.input_path, dag)
+             return dag  # return after processing the file
+
+         # when the input is a directory
+         for path in get_sql_file(self.input_path):
+             logger.debug(f"Generating Lineage file: {path}")
+             sql_content = read_file(path)
+             self._populate_dag(sql_content, path, dag)
+
+         return dag
+
+     def _populate_dag(self, sql_content: str, path: Path, dag: DAG):
+         for root_table, child in self.engine.analyse_table_lineage(self.source_dialect, sql_content, path):
+             dag.add_node(child)
+             dag.add_edge(root_table, child)
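`RootTableAnalyzer` only requires the engine to expose `analyse_table_lineage(source_dialect, sql_content, path)` and yield `(parent, child)` pairs, which it feeds into the `DAG` above. A minimal sketch with a stand-in engine and a temporary SQL file (the real callers pass a `SqlglotEngine`; the dialect, file name and table names are made up):

    import tempfile
    from pathlib import Path

    from databricks.labs.lakebridge.intermediate.root_tables import RootTableAnalyzer


    class StubEngine:
        # Stand-in for SqlglotEngine: pretend every file derives staging.orders from raw.orders.
        def analyse_table_lineage(self, source_dialect, sql_content, path):
            yield "raw.orders", "staging.orders"


    with tempfile.TemporaryDirectory() as tmp:
        sql_path = Path(tmp) / "load_orders.sql"
        sql_path.write_text("INSERT INTO staging.orders SELECT * FROM raw.orders")
        dag = RootTableAnalyzer(StubEngine(), "snowflake", sql_path).generate_lineage_dag()
        print(dag.identify_immediate_parents("staging.orders"))  # ['raw.orders']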
@@ -0,0 +1,56 @@
+ import logging
+ import os
+ import sys
+ import subprocess
+
+ from databricks.labs.blueprint.entrypoint import find_project_root
+ from databricks.labs.blueprint.cli import App
+
+
+ def proxy_command(app: App, command: str):
+     def fn(**_):
+         proxy = JvmProxy()
+         proxy.run()
+
+     fn.__name__ = command
+     fn.__doc__ = f"Proxy to run {command} in JVM"
+     app.command(fn)
+
+
+ class JvmProxy:
+     # TODO Refactor this class to use LSP protocol instead
+     def __init__(self):
+         self._root = find_project_root(__file__)
+         databricks_logger = logging.getLogger("databricks")
+         self._debug = databricks_logger.level == logging.DEBUG
+
+     def _recompile(self):
+         subprocess.run(
+             ["mvn", "compile", "-f", f'{self._root}/pom.xml'],
+             stdout=sys.stdout,
+             stderr=sys.stderr,
+             check=True,
+         )
+
+     def run(self):
+         if self._debug:
+             self._recompile()
+         classpath = self._root / 'core/target/classpath.txt'
+         classes = self._root / 'core/target/scala-2.12/classes'
+         # TODO: use the os-specific path separator
+         args = [
+             "java",
+             "--class-path",
+             f'{classes.as_posix()}:{classpath.read_text()}',
+             "com.databricks.labs.lakebridge.Main",
+             sys.argv[1],
+         ]
+         with subprocess.Popen(
+             args,
+             stdin=sys.stdin,
+             stdout=sys.stdout,
+             stderr=sys.stderr,
+             env=os.environ.copy(),
+             text=True,
+         ) as process:
+             return process.wait()
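`proxy_command` registers a named command on a `databricks.labs.blueprint` `App`, and the command body simply spawns the JVM entry point with the current process's stdio attached. A sketch of how a CLI module might wire it up (the command name is hypothetical):

    from databricks.labs.blueprint.cli import App

    from databricks.labs.lakebridge.jvmproxy import proxy_command

    app = App(__file__)
    proxy_command(app, "debug-script")  # exposes a "debug-script" command backed by JvmProxy.run()

    if __name__ == "__main__":
        app()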
@@ -0,0 +1,42 @@
+ import datetime
+ import logging
+ from pathlib import Path
+
+ from databricks.labs.lakebridge.intermediate.dag import DAG
+ from databricks.labs.lakebridge.intermediate.root_tables import RootTableAnalyzer
+ from databricks.labs.lakebridge.transpiler.sqlglot.sqlglot_engine import SqlglotEngine
+
+ logger = logging.getLogger(__name__)
+
+
+ def _generate_dot_file_contents(dag: DAG) -> str:
+     _lineage_str = "flowchart TD\n"
+     for node_name, node in dag.nodes.items():
+         if node.parents:
+             for parent in node.parents:
+                 _lineage_str += f" {node_name.capitalize()} --> {parent.capitalize()}\n"
+         else:
+             # Include nodes without parents to ensure they appear in the diagram
+             _lineage_str += f" {node_name.capitalize()}\n"
+     return _lineage_str
+
+
+ def lineage_generator(engine: SqlglotEngine, source_dialect: str, input_source: str, output_folder: str):
+     input_sql_path = Path(input_source)
+     output_folder = output_folder if output_folder.endswith('/') else output_folder + '/'
+
+     msg = f"Processing for SQLs at this location: {input_sql_path}"
+     logger.info(msg)
+     root_table_analyzer = RootTableAnalyzer(engine, source_dialect, input_sql_path)
+     generated_dag = root_table_analyzer.generate_lineage_dag()
+     lineage_file_content = _generate_dot_file_contents(generated_dag)
+
+     date_str = datetime.datetime.now().strftime("%d%m%y")
+
+     output_filename = Path(f"{output_folder}lineage_{date_str}.dot")
+     if output_filename.exists():
+         logger.warning(f'The output file already exists and will be replaced: {output_filename}')
+     logger.info(f"Attempting to write the lineage to {output_filename}")
+     with output_filename.open('w', encoding='utf-8') as f:
+         f.write(lineage_file_content)
+     logger.info(f"Succeeded to write the lineage to {output_filename}")
@@ -0,0 +1,414 @@
+ import logging
+ from functools import reduce
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.functions import col, expr, lit
+
+ from databricks.labs.lakebridge.reconcile.exception import ColumnMismatchException
+ from databricks.labs.lakebridge.reconcile.recon_capture import (
+     ReconIntermediatePersist,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_output_config import (
+     DataReconcileOutput,
+     MismatchOutput,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_config import (
+     AggregateRule,
+     ColumnMapping,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ _HASH_COLUMN_NAME = "hash_value_recon"
+ _SAMPLE_ROWS = 50
+
+
+ def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
+     error_msg = (
+         f"{msg}\n"
+         f"columns missing in source: {','.join(source_missing) if source_missing else None}\n"
+         f"columns missing in target: {','.join(target_missing) if target_missing else None}\n"
+     )
+     return ColumnMismatchException(error_msg)
+
+
+ def _generate_join_condition(source_alias, target_alias, key_columns):
+     conditions = [
+         col(f"{source_alias}.{key_column}").eqNullSafe(col(f"{target_alias}.{key_column}"))
+         for key_column in key_columns
+     ]
+     return reduce(lambda a, b: a & b, conditions)
+
+
+ def reconcile_data(
+     source: DataFrame,
+     target: DataFrame,
+     key_columns: list[str],
+     report_type: str,
+     spark: SparkSession,
+     path: str,
+ ) -> DataReconcileOutput:
+     source_alias = "src"
+     target_alias = "tgt"
+     if report_type not in {"data", "all"}:
+         key_columns = [_HASH_COLUMN_NAME]
+     df = (
+         source.alias(source_alias)
+         .join(
+             other=target.alias(target_alias),
+             on=_generate_join_condition(source_alias, target_alias, key_columns),
+             how="full",
+         )
+         .selectExpr(
+             *[f'{source_alias}.{col_name} as {source_alias}_{col_name}' for col_name in source.columns],
+             *[f'{target_alias}.{col_name} as {target_alias}_{col_name}' for col_name in target.columns],
+         )
+     )
+
+     # Write unmatched df to volume
+     df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(df)
+     logger.warning(f"Unmatched data is written to {path} successfully")
+
+     mismatch = _get_mismatch_data(df, source_alias, target_alias) if report_type in {"all", "data"} else None
+
+     missing_in_src = (
+         df.filter(col(f"{source_alias}_{_HASH_COLUMN_NAME}").isNull())
+         .select(
+             *[
+                 col(col_name).alias(col_name.replace(f'{target_alias}_', '').lower())
+                 for col_name in df.columns
+                 if col_name.startswith(f'{target_alias}_')
+             ]
+         )
+         .drop(_HASH_COLUMN_NAME)
+     )
+
+     missing_in_tgt = (
+         df.filter(col(f"{target_alias}_{_HASH_COLUMN_NAME}").isNull())
+         .select(
+             *[
+                 col(col_name).alias(col_name.replace(f'{source_alias}_', '').lower())
+                 for col_name in df.columns
+                 if col_name.startswith(f'{source_alias}_')
+             ]
+         )
+         .drop(_HASH_COLUMN_NAME)
+     )
+     mismatch_count = 0
+     if mismatch:
+         mismatch_count = mismatch.count()
+
+     missing_in_src_count = missing_in_src.count()
+     missing_in_tgt_count = missing_in_tgt.count()
+
+     return DataReconcileOutput(
+         mismatch_count=mismatch_count,
+         missing_in_src_count=missing_in_src_count,
+         missing_in_tgt_count=missing_in_tgt_count,
+         missing_in_src=missing_in_src.limit(_SAMPLE_ROWS),
+         missing_in_tgt=missing_in_tgt.limit(_SAMPLE_ROWS),
+         mismatch=MismatchOutput(mismatch_df=mismatch),
+     )
+
+
+ def _get_mismatch_data(df: DataFrame, src_alias: str, tgt_alias: str) -> DataFrame:
+     return (
+         df.filter(
+             (col(f"{src_alias}_{_HASH_COLUMN_NAME}").isNotNull())
+             & (col(f"{tgt_alias}_{_HASH_COLUMN_NAME}").isNotNull())
+         )
+         .withColumn(
+             "hash_match",
+             col(f"{src_alias}_{_HASH_COLUMN_NAME}") == col(f"{tgt_alias}_{_HASH_COLUMN_NAME}"),
+         )
+         .filter(col("hash_match") == lit(False))
+         .select(
+             *[
+                 col(col_name).alias(col_name.replace(f'{src_alias}_', '').lower())
+                 for col_name in df.columns
+                 if col_name.startswith(f'{src_alias}_')
+             ]
+         )
+         .drop(_HASH_COLUMN_NAME)
+     )
+
+
+ def _convert_columns_to_lowercase(df: DataFrame) -> DataFrame:
+     lowercased_columns = [col(column).alias(column.lower()) for column in df.columns]
+     return df.select(*lowercased_columns)
+
+
+ def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_columns: list[str]) -> MismatchOutput:
+     source_df = _convert_columns_to_lowercase(source)
+     target_df = _convert_columns_to_lowercase(target)
+
+     source_columns = source_df.columns
+     target_columns = target_df.columns
+
+     if source_columns != target_columns:
+         message = "source and target should have same columns for capturing the mismatch data"
+         source_missing = [column for column in target_columns if column not in source_columns]
+         target_missing = [column for column in source_columns if column not in target_columns]
+         raise raise_column_mismatch_exception(message, source_missing, target_missing)
+
+     check_columns = [column for column in source_columns if column not in key_columns]
+     mismatch_df = _get_mismatch_df(source_df, target_df, key_columns, check_columns)
+     mismatch_columns = _get_mismatch_columns(mismatch_df, check_columns)
+     return MismatchOutput(mismatch_df, mismatch_columns)
+
+
+ def _get_mismatch_columns(df: DataFrame, columns: list[str]):
+     # Collect the DataFrame to a local variable
+     local_df = df.collect()
+     mismatch_columns = []
+     for column in columns:
+         # Check if any row has False in the column
+         if any(not row[column + "_match"] for row in local_df):
+             mismatch_columns.append(column)
+     return mismatch_columns
+
+
+ def _get_mismatch_df(source: DataFrame, target: DataFrame, key_columns: list[str], column_list: list[str]):
+     source_aliased = [col('base.' + column).alias(column + '_base') for column in column_list]
+     target_aliased = [col('compare.' + column).alias(column + '_compare') for column in column_list]
+
+     match_expr = [expr(f"{column}_base=={column}_compare").alias(column + "_match") for column in column_list]
+     key_cols = [col(column) for column in key_columns]
+     select_expr = key_cols + source_aliased + target_aliased + match_expr
+
+     filter_columns = " and ".join([column + "_match" for column in column_list])
+     filter_expr = ~expr(filter_columns)
+
+     logger.info(f"KEY COLUMNS: {key_columns}")
+     logger.info(f"FILTER COLUMNS: {filter_expr}")
+     logger.info(f"SELECT COLUMNS: {select_expr}")
+
+     mismatch_df = (
+         source.alias('base').join(other=target.alias('compare'), on=key_columns, how="inner").select(*select_expr)
+     )
+
+     compare_columns = [column for column in mismatch_df.columns if column not in key_columns]
+     return mismatch_df.select(*key_columns + sorted(compare_columns))
+
+
+ def alias_column_str(alias: str, columns: list[str]) -> list[str]:
+     return [f"{alias}.{column}" for column in columns]
+
+
+ def _generate_agg_join_condition(source_alias: str, target_alias: str, key_columns: list[str]):
+     join_columns: list[ColumnMapping] = [
+         ColumnMapping(source_name=f"source_group_by_{key_col}", target_name=f"target_group_by_{key_col}")
+         for key_col in key_columns
+     ]
+     conditions = [
+         col(f"{source_alias}.{mapping.source_name}").eqNullSafe(col(f"{target_alias}.{mapping.target_name}"))
+         for mapping in join_columns
+     ]
+     return reduce(lambda a, b: a & b, conditions)
+
+
+ def _agg_conditions(
+     cols: list[ColumnMapping] | None,
+     condition_type: str = "group_filter",
+     op_type: str = "and",
+ ):
+     """
+     Generate conditions for aggregated data comparison based on the condition type
+     and reduces it based on the operator (and, or)
+
+     e.g., cols = [(source_min_col1, target_min_col1)]
+     1. condition_type = "group_filter"
+           source_group_by_col1 is not null and target_group_by_col1 is not null
+     2. condition_type = "select"
+           source_min_col1 == target_min_col1
+     3. condition_type = "missing_in_src"
+           source_min_col1 is null
+     4. condition_type = "missing_in_tgt"
+           target_min_col1 is null
+
+     :param cols: List of columns to compare
+     :param condition_type: Type of condition to generate
+     :param op_type: and, or
+     :return: Reduced column expressions
+     """
+     assert cols, "Columns must be specified for aggregation conditions"
+
+     if condition_type == "group_filter":
+         conditions_list = [
+             (col(f"{mapping.source_name}").isNotNull() & col(f"{mapping.target_name}").isNotNull()) for mapping in cols
+         ]
+     elif condition_type == "select":
+         conditions_list = [col(f"{mapping.source_name}") == col(f"{mapping.target_name}") for mapping in cols]
+     elif condition_type == "missing_in_src":
+         conditions_list = [col(f"{mapping.source_name}").isNull() for mapping in cols]
+     elif condition_type == "missing_in_tgt":
+         conditions_list = [col(f"{mapping.target_name}").isNull() for mapping in cols]
+     else:
+         raise ValueError(f"Invalid condition type: {condition_type}")
+
+     return reduce(lambda a, b: a & b if op_type == "and" else a | b, conditions_list)
+
+
+ def _generate_match_columns(select_cols: list[ColumnMapping]):
+     """
+     Generate match columns for the given select columns
+     e.g., select_cols = [(source_min_col1, target_min_col1), (source_count_col3, target_count_col3)]
+           |--------------------------------------|---------------------|
+           | match_min_col1 | match_count_col3 |
+           |--------------------------------------|--------------------|
+           source_min_col1 == target_min_col1 | source_count_col3 == target_count_col3
+           --------------------------------------|---------------------|
+
+     :param select_cols:
+     :return:
+     """
+     items = []
+     for mapping in select_cols:
+         match_col_name = mapping.source_name.replace("source_", "match_")
+         items.append((match_col_name, col(f"{mapping.source_name}") == col(f"{mapping.target_name}")))
+     return items
+
+
+ def _get_mismatch_agg_data(
+     df: DataFrame,
+     select_cols: list[ColumnMapping],
+     group_cols: list[ColumnMapping] | None,
+ ) -> DataFrame:
+     # TODO: Integrate with _get_mismatch_data function
+     """
+     For each rule select columns, generate a match column to compare the aggregated data between Source and Target
+
+     e.g., select_cols = [(source_min_col1, target_min_col1), (source_count_col3, target_count_col3)]
+
+      source_min_col1 | target_min_col1 | match_min_col1 | agg_data_match |
+     -----------------|--------------------|----------------|-------------------|
+          11          | 12 |source_min_col1 == target_min_col1 | False |
+
+     :param df: Joined DataFrame with aggregated data from Source and Target
+     :param select_cols: Rule specific select columns
+     :param group_cols: Rule specific group by columns, if any
+     :return: DataFrame with match_<AGG_TYPE>_<COLUMN> and agg_data_match columns
+                     to identify the aggregate data mismatch between Source and Target
+     """
+     df_with_match_cols = df
+
+     if group_cols:
+         # Filter Conditions are in the format of: source_group_by_col1 is not null and target_group_by_col1 is not null
+         filter_conditions = _agg_conditions(group_cols)
+         df_with_match_cols = df_with_match_cols.filter(filter_conditions)
+
+     # Generate match columns for the select columns. e.g., match_<AGG_TYPE>_<COLUMN>
+     for match_column_name, match_column in _generate_match_columns(select_cols):
+         df_with_match_cols = df_with_match_cols.withColumn(match_column_name, match_column)
+
+     # e.g., source_min_col1 == target_min_col1 and source_count_col3 == target_count_col3
+     select_conditions = _agg_conditions(select_cols, "select")
+
+     return df_with_match_cols.withColumn("agg_data_match", select_conditions).filter(
+         col("agg_data_match") == lit(False)
+     )
+
+
+ def reconcile_agg_data_per_rule(
+     joined_df: DataFrame,
+     source_columns: list[str],
+     target_columns: list[str],
+     rule: AggregateRule,
+ ) -> DataReconcileOutput:
+     """ "
+     Generates the reconciliation output for the given rule
+     """
+     # Generates select columns in the format of:
+     # [(source_min_col1, target_min_col1), (source_count_col3, target_count_col3) ... ]
+
+     rule_select_columns = [
+         ColumnMapping(
+             source_name=f"source_{rule.agg_type}_{rule.agg_column}",
+             target_name=f"target_{rule.agg_type}_{rule.agg_column}",
+         )
+     ]
+
+     rule_group_columns = None
+     if rule.group_by_columns:
+         rule_group_columns = [
+             ColumnMapping(source_name=f"source_group_by_{group_col}", target_name=f"target_group_by_{group_col}")
+             for group_col in rule.group_by_columns
+         ]
+         rule_select_columns.extend(rule_group_columns)
+
+     df_rule_columns = []
+     for mapping in rule_select_columns:
+         df_rule_columns.extend([mapping.source_name, mapping.target_name])
+
+     joined_df_with_rule_cols = joined_df.selectExpr(*df_rule_columns)
+
+     # Data mismatch between Source and Target aggregated data
+     mismatch = _get_mismatch_agg_data(joined_df_with_rule_cols, rule_select_columns, rule_group_columns)
+
+     # Data missing in Source DataFrame
+     rule_target_columns = set(target_columns).intersection([mapping.target_name for mapping in rule_select_columns])
+
+     missing_in_src = joined_df_with_rule_cols.filter(_agg_conditions(rule_select_columns, "missing_in_src")).select(
+         *rule_target_columns
+     )
+
+     # Data missing in Target DataFrame
+     rule_source_columns = set(source_columns).intersection([mapping.source_name for mapping in rule_select_columns])
+     missing_in_tgt = joined_df_with_rule_cols.filter(_agg_conditions(rule_select_columns, "missing_in_tgt")).select(
+         *rule_source_columns
+     )
+
+     mismatch_count = 0
+     if mismatch:
+         mismatch_count = mismatch.count()
+
+     rule_reconcile_output = DataReconcileOutput(
+         mismatch_count=mismatch_count,
+         missing_in_src_count=missing_in_src.count(),
+         missing_in_tgt_count=missing_in_tgt.count(),
+         missing_in_src=missing_in_src.limit(_SAMPLE_ROWS),
+         missing_in_tgt=missing_in_tgt.limit(_SAMPLE_ROWS),
+         mismatch=MismatchOutput(mismatch_df=mismatch),
+     )
+
+     return rule_reconcile_output
+
+
+ def join_aggregate_data(
+     source: DataFrame,
+     target: DataFrame,
+     key_columns: list[str] | None,
+     spark: SparkSession,
+     path: str,
+ ) -> DataFrame:
+     # TODO: Integrate with reconcile_data function
+
+     source_alias = "src"
+     target_alias = "tgt"
+
+     # Generates group by columns in the format of:
+     # [(source_group_by_col1, target_group_by_col1), (source_group_by_col2, target_group_by_col2) ... ]
+
+     if key_columns:
+         # If there are Group By columns, do Full join on the grouped columns
+         df = source.alias(source_alias).join(
+             other=target.alias(target_alias),
+             on=_generate_agg_join_condition(source_alias, target_alias, key_columns),
+             how="full",
+         )
+     else:
+         # If there is no Group By condition, do Cross join as there is only one record
+         df = source.alias(source_alias).join(
+             other=target.alias(target_alias),
+             how="cross",
+         )
+
+     joined_df = df.selectExpr(
+         *source.columns,
+         *target.columns,
+     )
+
+     # Write the joined df to volume path
+     joined_volume_df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(joined_df).cache()
+     logger.warning(f"Unmatched data is written to {path} successfully")
+
+     return joined_volume_df
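`capture_mismatch_data_and_columns` lower-cases both schemas, verifies they match, joins source and target on the key columns, and reports which non-key columns disagree. A minimal local sketch with two hypothetical two-row DataFrames keyed by `id` (requires an active Spark session; the `<column>_base == <column>_compare` match expressions rely on lateral column-alias resolution, available on Databricks Runtime and Spark 3.4+):

    from pyspark.sql import SparkSession

    from databricks.labs.lakebridge.reconcile.compare import capture_mismatch_data_and_columns

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    # The "name" value differs for id=2, so "name" is reported as a mismatched column.
    source = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "name"])
    target = spark.createDataFrame([(1, "a"), (2, "x")], ["id", "name"])

    output = capture_mismatch_data_and_columns(source, target, key_columns=["id"])
    print(output.mismatch_columns)  # ['name']
    output.mismatch_df.show()       # id, name_base, name_compare, name_match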