databricks-labs-lakebridge 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. databricks/__init__.py +3 -0
  2. databricks/labs/__init__.py +3 -0
  3. databricks/labs/lakebridge/__about__.py +2 -0
  4. databricks/labs/lakebridge/__init__.py +11 -0
  5. databricks/labs/lakebridge/assessments/configure_assessment.py +194 -0
  6. databricks/labs/lakebridge/assessments/pipeline.py +188 -0
  7. databricks/labs/lakebridge/assessments/profiler_config.py +30 -0
  8. databricks/labs/lakebridge/base_install.py +12 -0
  9. databricks/labs/lakebridge/cli.py +449 -0
  10. databricks/labs/lakebridge/config.py +192 -0
  11. databricks/labs/lakebridge/connections/__init__.py +0 -0
  12. databricks/labs/lakebridge/connections/credential_manager.py +89 -0
  13. databricks/labs/lakebridge/connections/database_manager.py +98 -0
  14. databricks/labs/lakebridge/connections/env_getter.py +13 -0
  15. databricks/labs/lakebridge/contexts/__init__.py +0 -0
  16. databricks/labs/lakebridge/contexts/application.py +133 -0
  17. databricks/labs/lakebridge/coverage/__init__.py +0 -0
  18. databricks/labs/lakebridge/coverage/commons.py +223 -0
  19. databricks/labs/lakebridge/coverage/lakebridge_snow_transpilation_coverage.py +29 -0
  20. databricks/labs/lakebridge/coverage/local_report.py +9 -0
  21. databricks/labs/lakebridge/coverage/sqlglot_snow_transpilation_coverage.py +5 -0
  22. databricks/labs/lakebridge/coverage/sqlglot_tsql_transpilation_coverage.py +5 -0
  23. databricks/labs/lakebridge/deployment/__init__.py +0 -0
  24. databricks/labs/lakebridge/deployment/configurator.py +199 -0
  25. databricks/labs/lakebridge/deployment/dashboard.py +140 -0
  26. databricks/labs/lakebridge/deployment/installation.py +125 -0
  27. databricks/labs/lakebridge/deployment/job.py +147 -0
  28. databricks/labs/lakebridge/deployment/recon.py +145 -0
  29. databricks/labs/lakebridge/deployment/table.py +30 -0
  30. databricks/labs/lakebridge/deployment/upgrade_common.py +124 -0
  31. databricks/labs/lakebridge/discovery/table.py +36 -0
  32. databricks/labs/lakebridge/discovery/table_definition.py +23 -0
  33. databricks/labs/lakebridge/discovery/tsql_table_definition.py +185 -0
  34. databricks/labs/lakebridge/errors/exceptions.py +1 -0
  35. databricks/labs/lakebridge/helpers/__init__.py +0 -0
  36. databricks/labs/lakebridge/helpers/db_sql.py +24 -0
  37. databricks/labs/lakebridge/helpers/execution_time.py +20 -0
  38. databricks/labs/lakebridge/helpers/file_utils.py +64 -0
  39. databricks/labs/lakebridge/helpers/metastore.py +164 -0
  40. databricks/labs/lakebridge/helpers/recon_config_utils.py +176 -0
  41. databricks/labs/lakebridge/helpers/string_utils.py +62 -0
  42. databricks/labs/lakebridge/helpers/telemetry_utils.py +13 -0
  43. databricks/labs/lakebridge/helpers/validation.py +101 -0
  44. databricks/labs/lakebridge/install.py +849 -0
  45. databricks/labs/lakebridge/intermediate/__init__.py +0 -0
  46. databricks/labs/lakebridge/intermediate/dag.py +88 -0
  47. databricks/labs/lakebridge/intermediate/engine_adapter.py +0 -0
  48. databricks/labs/lakebridge/intermediate/root_tables.py +44 -0
  49. databricks/labs/lakebridge/jvmproxy.py +56 -0
  50. databricks/labs/lakebridge/lineage.py +42 -0
  51. databricks/labs/lakebridge/reconcile/__init__.py +0 -0
  52. databricks/labs/lakebridge/reconcile/compare.py +414 -0
  53. databricks/labs/lakebridge/reconcile/connectors/__init__.py +0 -0
  54. databricks/labs/lakebridge/reconcile/connectors/data_source.py +72 -0
  55. databricks/labs/lakebridge/reconcile/connectors/databricks.py +87 -0
  56. databricks/labs/lakebridge/reconcile/connectors/jdbc_reader.py +41 -0
  57. databricks/labs/lakebridge/reconcile/connectors/oracle.py +108 -0
  58. databricks/labs/lakebridge/reconcile/connectors/secrets.py +30 -0
  59. databricks/labs/lakebridge/reconcile/connectors/snowflake.py +173 -0
  60. databricks/labs/lakebridge/reconcile/connectors/source_adapter.py +30 -0
  61. databricks/labs/lakebridge/reconcile/connectors/sql_server.py +132 -0
  62. databricks/labs/lakebridge/reconcile/constants.py +37 -0
  63. databricks/labs/lakebridge/reconcile/exception.py +42 -0
  64. databricks/labs/lakebridge/reconcile/execute.py +920 -0
  65. databricks/labs/lakebridge/reconcile/query_builder/__init__.py +0 -0
  66. databricks/labs/lakebridge/reconcile/query_builder/aggregate_query.py +293 -0
  67. databricks/labs/lakebridge/reconcile/query_builder/base.py +138 -0
  68. databricks/labs/lakebridge/reconcile/query_builder/count_query.py +33 -0
  69. databricks/labs/lakebridge/reconcile/query_builder/expression_generator.py +292 -0
  70. databricks/labs/lakebridge/reconcile/query_builder/hash_query.py +91 -0
  71. databricks/labs/lakebridge/reconcile/query_builder/sampling_query.py +123 -0
  72. databricks/labs/lakebridge/reconcile/query_builder/threshold_query.py +231 -0
  73. databricks/labs/lakebridge/reconcile/recon_capture.py +635 -0
  74. databricks/labs/lakebridge/reconcile/recon_config.py +363 -0
  75. databricks/labs/lakebridge/reconcile/recon_output_config.py +85 -0
  76. databricks/labs/lakebridge/reconcile/runner.py +97 -0
  77. databricks/labs/lakebridge/reconcile/sampler.py +239 -0
  78. databricks/labs/lakebridge/reconcile/schema_compare.py +126 -0
  79. databricks/labs/lakebridge/resources/__init__.py +0 -0
  80. databricks/labs/lakebridge/resources/config/credentials.yml +33 -0
  81. databricks/labs/lakebridge/resources/reconcile/__init__.py +0 -0
  82. databricks/labs/lakebridge/resources/reconcile/dashboards/__init__.py +0 -0
  83. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/00_0_aggregate_recon_header.md +6 -0
  84. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  85. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_1_executed_by.filter.yml +5 -0
  86. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/01_2_started_at.filter.yml +5 -0
  87. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  88. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_1_source_table.filter.yml +5 -0
  89. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/02_2_target_table.filter.yml +5 -0
  90. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/04_0_aggregate_summary_table.sql +46 -0
  91. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/05_0_aggregate_recon_drilldown_header.md +2 -0
  92. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_0_recon_id.filter.yml +5 -0
  93. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_1_category.filter.yml +5 -0
  94. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/06_2_aggregate_type.filter.yml +5 -0
  95. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_0_target_table.filter.yml +4 -0
  96. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/07_1_source_table.filter.yml +4 -0
  97. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/08_0_aggregate_details_table.sql +92 -0
  98. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/09_0_aggregate_missing_mismatch_header.md +1 -0
  99. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/10_0_aggr_mismatched_records.sql +19 -0
  100. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_0_aggr_missing_in_databricks.sql +19 -0
  101. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/11_1_aggr_missing_in_source.sql +19 -0
  102. databricks/labs/lakebridge/resources/reconcile/dashboards/aggregate_reconciliation_metrics/dashboard.yml +365 -0
  103. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/00_0_recon_main.md +3 -0
  104. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_0_recon_id.filter.yml +6 -0
  105. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_1_report_type.filter.yml +5 -0
  106. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/01_2_executed_by.filter.yml +5 -0
  107. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_0_source_type.filter.yml +5 -0
  108. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_1_source_table.filter.yml +6 -0
  109. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/02_2_target_table.filter.yml +6 -0
  110. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/03_0_started_at.filter.yml +5 -0
  111. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/05_0_summary_table.sql +38 -0
  112. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/06_0_schema_comparison_header.md +3 -0
  113. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/07_0_schema_details_table.sql +42 -0
  114. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/08_0_drill_down_header.md +3 -0
  115. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_0_recon_id.filter.yml +4 -0
  116. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/09_1_category.filter.yml +4 -0
  117. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_0_target_table.filter.yml +4 -0
  118. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/10_1_source_table.filter.yml +4 -0
  119. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/11_0_recon_details_pivot.sql +40 -0
  120. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/12_0_daily_data_validation_issue_header.md +3 -0
  121. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/13_0_success_fail_.filter.yml +4 -0
  122. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/14_0_failed_recon_ids.sql +15 -0
  123. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_0_total_failed_runs.sql +10 -0
  124. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_1_failed_targets.sql +10 -0
  125. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/15_2_successful_targets.sql +10 -0
  126. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/16_0_missing_mismatch_header.md +1 -0
  127. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_0_mismatched_records.sql +14 -0
  128. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/17_1_threshold_mismatches.sql +14 -0
  129. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_0_missing_in_databricks.sql +14 -0
  130. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/18_1_missing_in_source.sql +14 -0
  131. databricks/labs/lakebridge/resources/reconcile/dashboards/reconciliation_metrics/dashboard.yml +545 -0
  132. databricks/labs/lakebridge/resources/reconcile/queries/__init__.py +0 -0
  133. databricks/labs/lakebridge/resources/reconcile/queries/installation/__init__.py +0 -0
  134. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_details.sql +7 -0
  135. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_metrics.sql +15 -0
  136. databricks/labs/lakebridge/resources/reconcile/queries/installation/aggregate_rules.sql +6 -0
  137. databricks/labs/lakebridge/resources/reconcile/queries/installation/details.sql +7 -0
  138. databricks/labs/lakebridge/resources/reconcile/queries/installation/main.sql +24 -0
  139. databricks/labs/lakebridge/resources/reconcile/queries/installation/metrics.sql +21 -0
  140. databricks/labs/lakebridge/transpiler/__init__.py +0 -0
  141. databricks/labs/lakebridge/transpiler/execute.py +423 -0
  142. databricks/labs/lakebridge/transpiler/lsp/__init__.py +0 -0
  143. databricks/labs/lakebridge/transpiler/lsp/lsp_engine.py +564 -0
  144. databricks/labs/lakebridge/transpiler/sqlglot/__init__.py +0 -0
  145. databricks/labs/lakebridge/transpiler/sqlglot/dialect_utils.py +30 -0
  146. databricks/labs/lakebridge/transpiler/sqlglot/generator/__init__.py +0 -0
  147. databricks/labs/lakebridge/transpiler/sqlglot/generator/databricks.py +771 -0
  148. databricks/labs/lakebridge/transpiler/sqlglot/lca_utils.py +138 -0
  149. databricks/labs/lakebridge/transpiler/sqlglot/local_expression.py +197 -0
  150. databricks/labs/lakebridge/transpiler/sqlglot/parsers/__init__.py +0 -0
  151. databricks/labs/lakebridge/transpiler/sqlglot/parsers/oracle.py +23 -0
  152. databricks/labs/lakebridge/transpiler/sqlglot/parsers/presto.py +202 -0
  153. databricks/labs/lakebridge/transpiler/sqlglot/parsers/snowflake.py +535 -0
  154. databricks/labs/lakebridge/transpiler/sqlglot/sqlglot_engine.py +203 -0
  155. databricks/labs/lakebridge/transpiler/transpile_engine.py +49 -0
  156. databricks/labs/lakebridge/transpiler/transpile_status.py +68 -0
  157. databricks/labs/lakebridge/uninstall.py +28 -0
  158. databricks/labs/lakebridge/upgrades/v0.4.0_add_main_table_operation_name_column.py +80 -0
  159. databricks/labs/lakebridge/upgrades/v0.6.0_alter_metrics_datatype.py +51 -0
  160. databricks_labs_lakebridge-0.10.0.dist-info/METADATA +58 -0
  161. databricks_labs_lakebridge-0.10.0.dist-info/RECORD +171 -0
  162. databricks_labs_lakebridge-0.10.0.dist-info/WHEEL +4 -0
  163. databricks_labs_lakebridge-0.10.0.dist-info/entry_points.txt +2 -0
  164. databricks_labs_lakebridge-0.10.0.dist-info/licenses/LICENSE +69 -0
  165. databricks_labs_lakebridge-0.10.0.dist-info/licenses/NOTICE +42 -0
  166. docs/lakebridge/src/components/Button.tsx +81 -0
  167. docs/lakebridge/src/css/custom.css +167 -0
  168. docs/lakebridge/src/css/table.css +20 -0
  169. docs/lakebridge/src/pages/index.tsx +57 -0
  170. docs/lakebridge/src/theme/Footer/index.tsx +24 -0
  171. docs/lakebridge/src/theme/Layout/index.tsx +18 -0
@@ -0,0 +1,88 @@
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ class Node:
+     def __init__(self, name: str):
+         self.name = name.lower()
+         self.children: list[str] = []
+         self.parents: list[str] = []
+
+     def add_parent(self, node: str) -> None:
+         self.parents.append(node)
+
+     def add_child(self, node: str) -> None:
+         self.children.append(node)
+
+     def __repr__(self) -> str:
+         return f"Node({self.name}, {self.children})"
+
+
+ class DAG:
+     def __init__(self):
+         self.nodes: dict[str, Node] = {}
+
+     def add_node(self, node_name: str) -> None:
+         if node_name not in self.nodes and node_name not in {None, "none"}:
+             self.nodes[node_name.lower()] = Node(node_name.lower())
+
+     def add_edge(self, parent_name: str, child_name: str) -> None:
+         parent_name = parent_name.lower() if parent_name is not None else None
+         child_name = child_name.lower() if child_name is not None else None
+         logger.debug(f"Adding edge: {parent_name} -> {child_name}")
+         if parent_name not in self.nodes:
+             self.add_node(parent_name)
+         if child_name not in self.nodes:
+             self.add_node(child_name)
+
+         if child_name is not None:
+             self.nodes[parent_name].add_child(child_name)
+             self.nodes[child_name].add_parent(parent_name)
+
+     def identify_immediate_parents(self, table_name: str) -> list[str]:
+         table_name = table_name.lower()  # convert to lower() case
+         if table_name not in self.nodes:
+             logger.debug(f"Table with the name {table_name} not found in the DAG")
+             return []
+
+         return list(self.nodes[table_name].parents)
+
+     def identify_immediate_children(self, table_name: str) -> list[str]:
+         table_name = table_name.lower()  # convert to lower() case
+         if table_name not in self.nodes:
+             logger.debug(f"Table with the name {table_name} not found in the DAG")
+             return []
+
+         return list(self.nodes[table_name].children)
+
+     def _is_root_node(self, node_name: str) -> bool:
+         return len(self.identify_immediate_parents(node_name)) == 0
+
+     def walk_bfs(self, node: Node, level: int) -> set:
+         tables_at_level = set()
+         queue = [(node, 0)]  # The queue for the BFS. Each element is a tuple (node, level).
+         while queue:
+             current_node, node_level = queue.pop(0)
+
+             if node_level == level:
+                 tables_at_level.add(current_node.name)
+             elif node_level > level:
+                 break
+
+             for child_name in self.identify_immediate_children(current_node.name):
+                 queue.append((self.nodes[child_name], node_level + 1))
+         return tables_at_level
+
+     def identify_root_tables(self, level: int) -> set:
+         all_nodes = set(self.nodes.values())
+         root_tables_at_level = set()
+
+         for node in all_nodes:
+             if self._is_root_node(node.name):
+                 root_tables_at_level.update(self.walk_bfs(node, level))
+
+         return root_tables_at_level
+
+     def __repr__(self) -> str:
+         return str({node_name: str(node) for node_name, node in self.nodes.items()})
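`DAG` stores every table name lowercased and links each child table back to the tables it was derived from; `identify_root_tables(level)` walks breadth-first from the parentless nodes. A minimal usage sketch, assuming the package is installed (the table names are made up):

    from databricks.labs.lakebridge.intermediate.dag import DAG

    dag = DAG()
    # add_edge(parent, child): the child table is derived from the parent table
    dag.add_edge("raw.orders", "staging.orders")
    dag.add_edge("staging.orders", "mart.orders_daily")

    print(dag.identify_immediate_parents("mart.orders_daily"))  # ['staging.orders']
    print(dag.identify_root_tables(0))                          # {'raw.orders'}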
@@ -0,0 +1,44 @@
+ import logging
+ from pathlib import Path
+
+ from databricks.labs.lakebridge.helpers.file_utils import (
+     get_sql_file,
+     is_sql_file,
+     read_file,
+ )
+ from databricks.labs.lakebridge.intermediate.dag import DAG
+
+ from databricks.labs.lakebridge.transpiler.sqlglot.sqlglot_engine import SqlglotEngine
+
+ logger = logging.getLogger(__name__)
+
+
+ class RootTableAnalyzer:
+
+     def __init__(self, engine: SqlglotEngine, source_dialect: str, input_path: Path):
+         self.engine = engine
+         self.source_dialect = source_dialect
+         self.input_path = input_path
+
+     def generate_lineage_dag(self) -> DAG:
+         dag = DAG()
+
+         # when input is sql file then parse the file
+         if is_sql_file(self.input_path):
+             logger.debug(f"Generating Lineage file: {self.input_path}")
+             sql_content = read_file(self.input_path)
+             self._populate_dag(sql_content, self.input_path, dag)
+             return dag  # return after processing the file
+
+         # when the input is a directory
+         for path in get_sql_file(self.input_path):
+             logger.debug(f"Generating Lineage file: {path}")
+             sql_content = read_file(path)
+             self._populate_dag(sql_content, path, dag)
+
+         return dag
+
+     def _populate_dag(self, sql_content: str, path: Path, dag: DAG):
+         for root_table, child in self.engine.analyse_table_lineage(self.source_dialect, sql_content, path):
+             dag.add_node(child)
+             dag.add_edge(root_table, child)
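`RootTableAnalyzer` only requires the engine to expose `analyse_table_lineage(source_dialect, sql_content, path)` and yield `(parent, child)` pairs, which it feeds into the `DAG` above. A minimal sketch with a stand-in engine and a temporary SQL file (the real callers pass a `SqlglotEngine`; the dialect, file name and table names are made up):

    import tempfile
    from pathlib import Path

    from databricks.labs.lakebridge.intermediate.root_tables import RootTableAnalyzer


    class StubEngine:
        # Stand-in for SqlglotEngine: pretend every file derives staging.orders from raw.orders.
        def analyse_table_lineage(self, source_dialect, sql_content, path):
            yield "raw.orders", "staging.orders"


    with tempfile.TemporaryDirectory() as tmp:
        sql_path = Path(tmp) / "load_orders.sql"
        sql_path.write_text("INSERT INTO staging.orders SELECT * FROM raw.orders")
        dag = RootTableAnalyzer(StubEngine(), "snowflake", sql_path).generate_lineage_dag()
        print(dag.identify_immediate_parents("staging.orders"))  # ['raw.orders']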
@@ -0,0 +1,56 @@
+ import logging
+ import os
+ import sys
+ import subprocess
+
+ from databricks.labs.blueprint.entrypoint import find_project_root
+ from databricks.labs.blueprint.cli import App
+
+
+ def proxy_command(app: App, command: str):
+     def fn(**_):
+         proxy = JvmProxy()
+         proxy.run()
+
+     fn.__name__ = command
+     fn.__doc__ = f"Proxy to run {command} in JVM"
+     app.command(fn)
+
+
+ class JvmProxy:
+     # TODO Refactor this class to use LSP protocol instead
+     def __init__(self):
+         self._root = find_project_root(__file__)
+         databricks_logger = logging.getLogger("databricks")
+         self._debug = databricks_logger.level == logging.DEBUG
+
+     def _recompile(self):
+         subprocess.run(
+             ["mvn", "compile", "-f", f'{self._root}/pom.xml'],
+             stdout=sys.stdout,
+             stderr=sys.stderr,
+             check=True,
+         )
+
+     def run(self):
+         if self._debug:
+             self._recompile()
+         classpath = self._root / 'core/target/classpath.txt'
+         classes = self._root / 'core/target/scala-2.12/classes'
+         # TODO: use the os-specific path separator
+         args = [
+             "java",
+             "--class-path",
+             f'{classes.as_posix()}:{classpath.read_text()}',
+             "com.databricks.labs.lakebridge.Main",
+             sys.argv[1],
+         ]
+         with subprocess.Popen(
+             args,
+             stdin=sys.stdin,
+             stdout=sys.stdout,
+             stderr=sys.stderr,
+             env=os.environ.copy(),
+             text=True,
+         ) as process:
+             return process.wait()
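`proxy_command` registers a named command on a `databricks.labs.blueprint` `App`, and the command body simply spawns the JVM entry point with the current process's stdio attached. A sketch of how a CLI module might wire it up (the command name is hypothetical):

    from databricks.labs.blueprint.cli import App

    from databricks.labs.lakebridge.jvmproxy import proxy_command

    app = App(__file__)
    proxy_command(app, "debug-script")  # exposes a "debug-script" command backed by JvmProxy.run()

    if __name__ == "__main__":
        app()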
@@ -0,0 +1,42 @@
+ import datetime
+ import logging
+ from pathlib import Path
+
+ from databricks.labs.lakebridge.intermediate.dag import DAG
+ from databricks.labs.lakebridge.intermediate.root_tables import RootTableAnalyzer
+ from databricks.labs.lakebridge.transpiler.sqlglot.sqlglot_engine import SqlglotEngine
+
+ logger = logging.getLogger(__name__)
+
+
+ def _generate_dot_file_contents(dag: DAG) -> str:
+     _lineage_str = "flowchart TD\n"
+     for node_name, node in dag.nodes.items():
+         if node.parents:
+             for parent in node.parents:
+                 _lineage_str += f" {node_name.capitalize()} --> {parent.capitalize()}\n"
+         else:
+             # Include nodes without parents to ensure they appear in the diagram
+             _lineage_str += f" {node_name.capitalize()}\n"
+     return _lineage_str
+
+
+ def lineage_generator(engine: SqlglotEngine, source_dialect: str, input_source: str, output_folder: str):
+     input_sql_path = Path(input_source)
+     output_folder = output_folder if output_folder.endswith('/') else output_folder + '/'
+
+     msg = f"Processing for SQLs at this location: {input_sql_path}"
+     logger.info(msg)
+     root_table_analyzer = RootTableAnalyzer(engine, source_dialect, input_sql_path)
+     generated_dag = root_table_analyzer.generate_lineage_dag()
+     lineage_file_content = _generate_dot_file_contents(generated_dag)
+
+     date_str = datetime.datetime.now().strftime("%d%m%y")
+
+     output_filename = Path(f"{output_folder}lineage_{date_str}.dot")
+     if output_filename.exists():
+         logger.warning(f'The output file already exists and will be replaced: {output_filename}')
+     logger.info(f"Attempting to write the lineage to {output_filename}")
+     with output_filename.open('w', encoding='utf-8') as f:
+         f.write(lineage_file_content)
+     logger.info(f"Succeeded to write the lineage to {output_filename}")
@@ -0,0 +1,414 @@
+ import logging
+ from functools import reduce
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.functions import col, expr, lit
+
+ from databricks.labs.lakebridge.reconcile.exception import ColumnMismatchException
+ from databricks.labs.lakebridge.reconcile.recon_capture import (
+     ReconIntermediatePersist,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_output_config import (
+     DataReconcileOutput,
+     MismatchOutput,
+ )
+ from databricks.labs.lakebridge.reconcile.recon_config import (
+     AggregateRule,
+     ColumnMapping,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ _HASH_COLUMN_NAME = "hash_value_recon"
+ _SAMPLE_ROWS = 50
+
+
+ def raise_column_mismatch_exception(msg: str, source_missing: list[str], target_missing: list[str]) -> Exception:
+     error_msg = (
+         f"{msg}\n"
+         f"columns missing in source: {','.join(source_missing) if source_missing else None}\n"
+         f"columns missing in target: {','.join(target_missing) if target_missing else None}\n"
+     )
+     return ColumnMismatchException(error_msg)
+
+
+ def _generate_join_condition(source_alias, target_alias, key_columns):
+     conditions = [
+         col(f"{source_alias}.{key_column}").eqNullSafe(col(f"{target_alias}.{key_column}"))
+         for key_column in key_columns
+     ]
+     return reduce(lambda a, b: a & b, conditions)
+
+
+ def reconcile_data(
+     source: DataFrame,
+     target: DataFrame,
+     key_columns: list[str],
+     report_type: str,
+     spark: SparkSession,
+     path: str,
+ ) -> DataReconcileOutput:
+     source_alias = "src"
+     target_alias = "tgt"
+     if report_type not in {"data", "all"}:
+         key_columns = [_HASH_COLUMN_NAME]
+     df = (
+         source.alias(source_alias)
+         .join(
+             other=target.alias(target_alias),
+             on=_generate_join_condition(source_alias, target_alias, key_columns),
+             how="full",
+         )
+         .selectExpr(
+             *[f'{source_alias}.{col_name} as {source_alias}_{col_name}' for col_name in source.columns],
+             *[f'{target_alias}.{col_name} as {target_alias}_{col_name}' for col_name in target.columns],
+         )
+     )
+
+     # Write unmatched df to volume
+     df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(df)
+     logger.warning(f"Unmatched data is written to {path} successfully")
+
+     mismatch = _get_mismatch_data(df, source_alias, target_alias) if report_type in {"all", "data"} else None
+
+     missing_in_src = (
+         df.filter(col(f"{source_alias}_{_HASH_COLUMN_NAME}").isNull())
+         .select(
+             *[
+                 col(col_name).alias(col_name.replace(f'{target_alias}_', '').lower())
+                 for col_name in df.columns
+                 if col_name.startswith(f'{target_alias}_')
+             ]
+         )
+         .drop(_HASH_COLUMN_NAME)
+     )
+
+     missing_in_tgt = (
+         df.filter(col(f"{target_alias}_{_HASH_COLUMN_NAME}").isNull())
+         .select(
+             *[
+                 col(col_name).alias(col_name.replace(f'{source_alias}_', '').lower())
+                 for col_name in df.columns
+                 if col_name.startswith(f'{source_alias}_')
+             ]
+         )
+         .drop(_HASH_COLUMN_NAME)
+     )
+     mismatch_count = 0
+     if mismatch:
+         mismatch_count = mismatch.count()
+
+     missing_in_src_count = missing_in_src.count()
+     missing_in_tgt_count = missing_in_tgt.count()
+
+     return DataReconcileOutput(
+         mismatch_count=mismatch_count,
+         missing_in_src_count=missing_in_src_count,
+         missing_in_tgt_count=missing_in_tgt_count,
+         missing_in_src=missing_in_src.limit(_SAMPLE_ROWS),
+         missing_in_tgt=missing_in_tgt.limit(_SAMPLE_ROWS),
+         mismatch=MismatchOutput(mismatch_df=mismatch),
+     )
+
+
+ def _get_mismatch_data(df: DataFrame, src_alias: str, tgt_alias: str) -> DataFrame:
+     return (
+         df.filter(
+             (col(f"{src_alias}_{_HASH_COLUMN_NAME}").isNotNull())
+             & (col(f"{tgt_alias}_{_HASH_COLUMN_NAME}").isNotNull())
+         )
+         .withColumn(
+             "hash_match",
+             col(f"{src_alias}_{_HASH_COLUMN_NAME}") == col(f"{tgt_alias}_{_HASH_COLUMN_NAME}"),
+         )
+         .filter(col("hash_match") == lit(False))
+         .select(
+             *[
+                 col(col_name).alias(col_name.replace(f'{src_alias}_', '').lower())
+                 for col_name in df.columns
+                 if col_name.startswith(f'{src_alias}_')
+             ]
+         )
+         .drop(_HASH_COLUMN_NAME)
+     )
+
+
+ def _convert_columns_to_lowercase(df: DataFrame) -> DataFrame:
+     lowercased_columns = [col(column).alias(column.lower()) for column in df.columns]
+     return df.select(*lowercased_columns)
+
+
+ def capture_mismatch_data_and_columns(source: DataFrame, target: DataFrame, key_columns: list[str]) -> MismatchOutput:
+     source_df = _convert_columns_to_lowercase(source)
+     target_df = _convert_columns_to_lowercase(target)
+
+     source_columns = source_df.columns
+     target_columns = target_df.columns
+
+     if source_columns != target_columns:
+         message = "source and target should have same columns for capturing the mismatch data"
+         source_missing = [column for column in target_columns if column not in source_columns]
+         target_missing = [column for column in source_columns if column not in target_columns]
+         raise raise_column_mismatch_exception(message, source_missing, target_missing)
+
+     check_columns = [column for column in source_columns if column not in key_columns]
+     mismatch_df = _get_mismatch_df(source_df, target_df, key_columns, check_columns)
+     mismatch_columns = _get_mismatch_columns(mismatch_df, check_columns)
+     return MismatchOutput(mismatch_df, mismatch_columns)
+
+
+ def _get_mismatch_columns(df: DataFrame, columns: list[str]):
+     # Collect the DataFrame to a local variable
+     local_df = df.collect()
+     mismatch_columns = []
+     for column in columns:
+         # Check if any row has False in the column
+         if any(not row[column + "_match"] for row in local_df):
+             mismatch_columns.append(column)
+     return mismatch_columns
+
+
+ def _get_mismatch_df(source: DataFrame, target: DataFrame, key_columns: list[str], column_list: list[str]):
+     source_aliased = [col('base.' + column).alias(column + '_base') for column in column_list]
+     target_aliased = [col('compare.' + column).alias(column + '_compare') for column in column_list]
+
+     match_expr = [expr(f"{column}_base=={column}_compare").alias(column + "_match") for column in column_list]
+     key_cols = [col(column) for column in key_columns]
+     select_expr = key_cols + source_aliased + target_aliased + match_expr
+
+     filter_columns = " and ".join([column + "_match" for column in column_list])
+     filter_expr = ~expr(filter_columns)
+
+     logger.info(f"KEY COLUMNS: {key_columns}")
+     logger.info(f"FILTER COLUMNS: {filter_expr}")
+     logger.info(f"SELECT COLUMNS: {select_expr}")
+
+     mismatch_df = (
+         source.alias('base').join(other=target.alias('compare'), on=key_columns, how="inner").select(*select_expr)
+     )
+
+     compare_columns = [column for column in mismatch_df.columns if column not in key_columns]
+     return mismatch_df.select(*key_columns + sorted(compare_columns))
+
+
+ def alias_column_str(alias: str, columns: list[str]) -> list[str]:
+     return [f"{alias}.{column}" for column in columns]
+
+
+ def _generate_agg_join_condition(source_alias: str, target_alias: str, key_columns: list[str]):
+     join_columns: list[ColumnMapping] = [
+         ColumnMapping(source_name=f"source_group_by_{key_col}", target_name=f"target_group_by_{key_col}")
+         for key_col in key_columns
+     ]
+     conditions = [
+         col(f"{source_alias}.{mapping.source_name}").eqNullSafe(col(f"{target_alias}.{mapping.target_name}"))
+         for mapping in join_columns
+     ]
+     return reduce(lambda a, b: a & b, conditions)
+
+
+ def _agg_conditions(
+     cols: list[ColumnMapping] | None,
+     condition_type: str = "group_filter",
+     op_type: str = "and",
+ ):
+     """
+     Generate conditions for aggregated data comparison based on the condition type
+     and reduces it based on the operator (and, or)
+
+     e.g., cols = [(source_min_col1, target_min_col1)]
+     1. condition_type = "group_filter"
+           source_group_by_col1 is not null and target_group_by_col1 is not null
+     2. condition_type = "select"
+           source_min_col1 == target_min_col1
+     3. condition_type = "missing_in_src"
+           source_min_col1 is null
+     4. condition_type = "missing_in_tgt"
+           target_min_col1 is null
+
+     :param cols: List of columns to compare
+     :param condition_type: Type of condition to generate
+     :param op_type: and, or
+     :return: Reduced column expressions
+     """
+     assert cols, "Columns must be specified for aggregation conditions"
+
+     if condition_type == "group_filter":
+         conditions_list = [
+             (col(f"{mapping.source_name}").isNotNull() & col(f"{mapping.target_name}").isNotNull()) for mapping in cols
+         ]
+     elif condition_type == "select":
+         conditions_list = [col(f"{mapping.source_name}") == col(f"{mapping.target_name}") for mapping in cols]
+     elif condition_type == "missing_in_src":
+         conditions_list = [col(f"{mapping.source_name}").isNull() for mapping in cols]
+     elif condition_type == "missing_in_tgt":
+         conditions_list = [col(f"{mapping.target_name}").isNull() for mapping in cols]
+     else:
+         raise ValueError(f"Invalid condition type: {condition_type}")
+
+     return reduce(lambda a, b: a & b if op_type == "and" else a | b, conditions_list)
+
+
+ def _generate_match_columns(select_cols: list[ColumnMapping]):
+     """
+     Generate match columns for the given select columns
+     e.g., select_cols = [(source_min_col1, target_min_col1), (source_count_col3, target_count_col3)]
+           |--------------------------------------|---------------------|
+           | match_min_col1 | match_count_col3 |
+           |--------------------------------------|--------------------|
+           source_min_col1 == target_min_col1 | source_count_col3 == target_count_col3
+           --------------------------------------|---------------------|
+
+     :param select_cols:
+     :return:
+     """
+     items = []
+     for mapping in select_cols:
+         match_col_name = mapping.source_name.replace("source_", "match_")
+         items.append((match_col_name, col(f"{mapping.source_name}") == col(f"{mapping.target_name}")))
+     return items
+
+
+ def _get_mismatch_agg_data(
+     df: DataFrame,
+     select_cols: list[ColumnMapping],
+     group_cols: list[ColumnMapping] | None,
+ ) -> DataFrame:
+     # TODO: Integrate with _get_mismatch_data function
+     """
+     For each rule select columns, generate a match column to compare the aggregated data between Source and Target
+
+     e.g., select_cols = [(source_min_col1, target_min_col1), (source_count_col3, target_count_col3)]
+
+      source_min_col1 | target_min_col1 | match_min_col1 | agg_data_match |
+     -----------------|--------------------|----------------|-------------------|
+          11          | 12 |source_min_col1 == target_min_col1 | False |
+
+     :param df: Joined DataFrame with aggregated data from Source and Target
+     :param select_cols: Rule specific select columns
+     :param group_cols: Rule specific group by columns, if any
+     :return: DataFrame with match_<AGG_TYPE>_<COLUMN> and agg_data_match columns
+                     to identify the aggregate data mismatch between Source and Target
+     """
+     df_with_match_cols = df
+
+     if group_cols:
+         # Filter Conditions are in the format of: source_group_by_col1 is not null and target_group_by_col1 is not null
+         filter_conditions = _agg_conditions(group_cols)
+         df_with_match_cols = df_with_match_cols.filter(filter_conditions)
+
+     # Generate match columns for the select columns. e.g., match_<AGG_TYPE>_<COLUMN>
+     for match_column_name, match_column in _generate_match_columns(select_cols):
+         df_with_match_cols = df_with_match_cols.withColumn(match_column_name, match_column)
+
+     # e.g., source_min_col1 == target_min_col1 and source_count_col3 == target_count_col3
+     select_conditions = _agg_conditions(select_cols, "select")
+
+     return df_with_match_cols.withColumn("agg_data_match", select_conditions).filter(
+         col("agg_data_match") == lit(False)
+     )
+
+
+ def reconcile_agg_data_per_rule(
+     joined_df: DataFrame,
+     source_columns: list[str],
+     target_columns: list[str],
+     rule: AggregateRule,
+ ) -> DataReconcileOutput:
+     """ "
+     Generates the reconciliation output for the given rule
+     """
+     # Generates select columns in the format of:
+     # [(source_min_col1, target_min_col1), (source_count_col3, target_count_col3) ... ]
+
+     rule_select_columns = [
+         ColumnMapping(
+             source_name=f"source_{rule.agg_type}_{rule.agg_column}",
+             target_name=f"target_{rule.agg_type}_{rule.agg_column}",
+         )
+     ]
+
+     rule_group_columns = None
+     if rule.group_by_columns:
+         rule_group_columns = [
+             ColumnMapping(source_name=f"source_group_by_{group_col}", target_name=f"target_group_by_{group_col}")
+             for group_col in rule.group_by_columns
+         ]
+         rule_select_columns.extend(rule_group_columns)
+
+     df_rule_columns = []
+     for mapping in rule_select_columns:
+         df_rule_columns.extend([mapping.source_name, mapping.target_name])
+
+     joined_df_with_rule_cols = joined_df.selectExpr(*df_rule_columns)
+
+     # Data mismatch between Source and Target aggregated data
+     mismatch = _get_mismatch_agg_data(joined_df_with_rule_cols, rule_select_columns, rule_group_columns)
+
+     # Data missing in Source DataFrame
+     rule_target_columns = set(target_columns).intersection([mapping.target_name for mapping in rule_select_columns])
+
+     missing_in_src = joined_df_with_rule_cols.filter(_agg_conditions(rule_select_columns, "missing_in_src")).select(
+         *rule_target_columns
+     )
+
+     # Data missing in Target DataFrame
+     rule_source_columns = set(source_columns).intersection([mapping.source_name for mapping in rule_select_columns])
+     missing_in_tgt = joined_df_with_rule_cols.filter(_agg_conditions(rule_select_columns, "missing_in_tgt")).select(
+         *rule_source_columns
+     )
+
+     mismatch_count = 0
+     if mismatch:
+         mismatch_count = mismatch.count()
+
+     rule_reconcile_output = DataReconcileOutput(
+         mismatch_count=mismatch_count,
+         missing_in_src_count=missing_in_src.count(),
+         missing_in_tgt_count=missing_in_tgt.count(),
+         missing_in_src=missing_in_src.limit(_SAMPLE_ROWS),
+         missing_in_tgt=missing_in_tgt.limit(_SAMPLE_ROWS),
+         mismatch=MismatchOutput(mismatch_df=mismatch),
+     )
+
+     return rule_reconcile_output
+
+
+ def join_aggregate_data(
+     source: DataFrame,
+     target: DataFrame,
+     key_columns: list[str] | None,
+     spark: SparkSession,
+     path: str,
+ ) -> DataFrame:
+     # TODO: Integrate with reconcile_data function
+
+     source_alias = "src"
+     target_alias = "tgt"
+
+     # Generates group by columns in the format of:
+     # [(source_group_by_col1, target_group_by_col1), (source_group_by_col2, target_group_by_col2) ... ]
+
+     if key_columns:
+         # If there are Group By columns, do Full join on the grouped columns
+         df = source.alias(source_alias).join(
+             other=target.alias(target_alias),
+             on=_generate_agg_join_condition(source_alias, target_alias, key_columns),
+             how="full",
+         )
+     else:
+         # If there is no Group By condition, do Cross join as there is only one record
+         df = source.alias(source_alias).join(
+             other=target.alias(target_alias),
+             how="cross",
+         )
+
+     joined_df = df.selectExpr(
+         *source.columns,
+         *target.columns,
+     )
+
+     # Write the joined df to volume path
+     joined_volume_df = ReconIntermediatePersist(spark, path).write_and_read_unmatched_df_with_volumes(joined_df).cache()
+     logger.warning(f"Unmatched data is written to {path} successfully")
+
+     return joined_volume_df
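`capture_mismatch_data_and_columns` lower-cases both schemas, verifies they match, joins source and target on the key columns, and reports which non-key columns disagree. A minimal local sketch with two hypothetical two-row DataFrames keyed by `id` (requires an active Spark session; the `<column>_base == <column>_compare` match expressions rely on lateral column-alias resolution, available on Databricks Runtime and Spark 3.4+):

    from pyspark.sql import SparkSession

    from databricks.labs.lakebridge.reconcile.compare import capture_mismatch_data_and_columns

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    # The "name" value differs for id=2, so "name" is reported as a mismatched column.
    source = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "name"])
    target = spark.createDataFrame([(1, "a"), (2, "x")], ["id", "name"])

    output = capture_mismatch_data_and_columns(source, target, key_columns=["id"])
    print(output.mismatch_columns)  # ['name']
    output.mismatch_df.show()       # id, name_base, name_compare, name_match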