Flowfile 0.3.4.1__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (122)
  1. flowfile/__init__.py +3 -3
  2. flowfile/api.py +36 -15
  3. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  4. flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
  5. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  6. flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
  7. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  8. flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
  9. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
  10. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
  11. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
  12. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
  13. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
  14. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
  15. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
  16. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
  17. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
  18. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  19. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
  20. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
  21. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
  22. flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
  23. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  24. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
  25. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  26. flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
  27. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
  28. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
  29. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
  30. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
  31. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
  32. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
  33. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
  34. flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
  35. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
  36. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
  37. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
  38. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
  39. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
  40. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
  41. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
  42. flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
  43. flowfile/web/static/assets/api-fb67319c.js +80 -0
  44. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  45. flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
  46. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
  47. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
  48. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
  49. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
  50. flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
  51. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
  52. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
  53. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
  54. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
  55. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
  56. flowfile/web/static/index.html +1 -1
  57. {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
  58. {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/RECORD +109 -104
  59. {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
  60. flowfile_core/__init__.py +2 -0
  61. flowfile_core/configs/node_store/nodes.py +8 -6
  62. flowfile_core/database/connection.py +63 -15
  63. flowfile_core/database/init_db.py +0 -1
  64. flowfile_core/database/models.py +49 -2
  65. flowfile_core/flowfile/code_generator/code_generator.py +402 -18
  66. flowfile_core/flowfile/connection_manager/models.py +1 -1
  67. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  68. flowfile_core/flowfile/extensions.py +1 -1
  69. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  70. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  71. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
  72. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  73. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  74. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  75. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  76. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  77. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  78. flowfile_core/flowfile/flow_graph.py +119 -82
  79. flowfile_core/flowfile/flow_node/flow_node.py +68 -33
  80. flowfile_core/flowfile/flow_node/models.py +32 -3
  81. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  82. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  83. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  84. flowfile_core/flowfile/utils.py +1 -23
  85. flowfile_core/main.py +3 -2
  86. flowfile_core/routes/cloud_connections.py +81 -0
  87. flowfile_core/routes/logs.py +0 -1
  88. flowfile_core/routes/routes.py +3 -39
  89. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  90. flowfile_core/schemas/input_schema.py +37 -15
  91. flowfile_core/schemas/schemas.py +7 -2
  92. flowfile_core/schemas/transform_schema.py +97 -22
  93. flowfile_core/utils/utils.py +40 -1
  94. flowfile_core/utils/validate_setup.py +41 -0
  95. flowfile_frame/flow_frame.py +253 -102
  96. flowfile_frame/flow_frame_methods.py +13 -13
  97. flowfile_worker/external_sources/s3_source/main.py +216 -0
  98. flowfile_worker/external_sources/s3_source/models.py +142 -0
  99. flowfile_worker/funcs.py +51 -6
  100. flowfile_worker/models.py +22 -2
  101. flowfile_worker/routes.py +40 -38
  102. flowfile_worker/utils.py +1 -1
  103. test_utils/s3/commands.py +46 -0
  104. test_utils/s3/data_generator.py +291 -0
  105. test_utils/s3/fixtures.py +209 -0
  106. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  107. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  108. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  109. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  110. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  111. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  112. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  114. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  115. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  116. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  117. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  118. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  119. {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
  120. {flowfile-0.3.4.1.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
  121. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  122. {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
flowfile_core/flowfile/flow_data_engine/flow_data_engine.py

@@ -4,7 +4,7 @@ import os
 from copy import deepcopy
 from dataclasses import dataclass
 from math import ceil
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar
 
 # Third-party imports
 from loky import Future
@@ -16,25 +16,34 @@ from pyarrow.parquet import ParquetFile
 
 # Local imports - Core
 from flowfile_core.configs import logger
+from flowfile_core.utils.utils import ensure_similarity_dicts
 from flowfile_core.configs.flow_logger import NodeLogger
 from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
 from flowfile_core.schemas import (
+    cloud_storage_schemas,
     input_schema,
     transform_schema as transform_schemas
 )
 
 # Local imports - Flow File Components
 from flowfile_core.flowfile.flow_data_engine import utils
+from flowfile_core.flowfile.flow_data_engine.cloud_storage_reader import (CloudStorageReader,
+                                                                          ensure_path_has_wildcard_pattern,
+                                                                          get_first_file_from_s3_dir)
 from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
     FlowfileColumn,
+    assert_if_flowfile_schema,
     convert_stats_to_column_info
 )
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
 from flowfile_core.flowfile.flow_data_engine.join import (
     verify_join_select_integrity,
-    verify_join_map_integrity
+    verify_join_map_integrity,
+    rename_df_table_for_join,
+    get_undo_rename_mapping_join,
+    get_col_name_to_delete
 )
 from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
 from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
@@ -52,6 +61,55 @@ from flowfile_core.flowfile.flow_data_engine.threaded_processes import (
 
 from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
 
+T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
+
+
+def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform_schemas.JoinInput) -> Tuple[T, T, Dict[str, str]]:
+
+    def _construct_temp_name(column_name: str) -> str:
+        return "__FL_TEMP__"+column_name
+    if join_input.how == 'right':
+        left_df = left_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+                                       for jk in join_input.left_select.join_key_selects)
+        reverse_actions = {
+            _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("left", jk.new_name)
+            for jk in join_input.left_select.join_key_selects}
+    elif join_input.how in ('left', 'inner'):
+        right_df = right_df.with_columns(pl.col(jk.new_name).alias(_construct_temp_name(jk.new_name))
+                                         for jk in join_input.right_select.join_key_selects)
+        reverse_actions = {
+            _construct_temp_name(jk.new_name): transform_schemas.construct_join_key_name("right", jk.new_name)
+            for jk in join_input.right_select.join_key_selects}
+    else:
+        reverse_actions = {}
+    return left_df, right_df, reverse_actions
+
+
+def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.JoinInput) -> None:
+    """
+    Updates the right columns of the join input by deselecting them.
+    Args:
+        join_input ():
+
+    Returns:
+        None
+    """
+    if join_input.how in ('semi', 'anti'):
+        for jk in join_input.right_select.renames:
+            jk.keep = False
+
+
+def get_select_columns(full_select_input: List[transform_schemas.SelectInput]) -> List[str]:
+    """
+    Gets the list of column names to select from the full select input.
+    It filters out columns that are not marked to keep or join keys, and only includes those that are available.
+    Args:
+        full_select_input (): List of SelectInput objects containing column information.
+
+    Returns:
+        List of column names to select.
+    """
+    return [v.old_name for v in full_select_input if (v.keep or v.join_key) and v.is_available]
+
 
 @dataclass
 class FlowDataEngine:
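
Example (not part of the diff): the temporary-rename idea that _handle_duplication_join_keys relies on, shown with plain Polars. Copying the right join key to a __FL_TEMP__ column before a left join keeps that key available after Polars coalesces the key columns, and it is renamed back afterwards. The frame contents and the right_id target name are illustrative only.

    import polars as pl

    left = pl.LazyFrame({"id": [1, 2, 3], "l_val": ["a", "b", "c"]})
    right = pl.LazyFrame({"id": [2, 3, 4], "r_val": ["x", "y", "z"]})

    # Keep a copy of the right join key under a temporary name so it survives the join.
    right_tmp = right.with_columns(pl.col("id").alias("__FL_TEMP__id"))
    joined = (
        left.join(right_tmp, left_on="id", right_on="id", how="left", suffix="")
        .rename({"__FL_TEMP__id": "right_id"})
    )
    print(joined.collect())
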
@@ -110,7 +168,7 @@ class FlowDataEngine:
     # flow_id: int = None # TODO: Implement flow_id
 
     def __init__(self,
-                 raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
+                 raw_data: Union[List[Dict], List[Any], Dict[str, Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
                  path_ref: str = None,
                  name: str = None,
                  optimize_memory: bool = True,
@@ -129,7 +187,6 @@
             self._handle_path_ref(path_ref, optimize_memory)
         else:
             self.initialize_empty_fl()
-
         self._finalize_initialization(name, optimize_memory, schema, calculate_schema_stats)
 
     def _initialize_attributes(self, number_of_records_callback, data_callback, streamable):
@@ -172,6 +229,7 @@
         elif optimize_memory:
             self.number_of_records = -1
         else:
+            # TODO: assess whether this leads to slow downs with multi remote files
             self.number_of_records = lf.select(pl.len()).collect()[0, 0]
 
     def _handle_python_data(self, data: Union[List, Dict]):
@@ -187,12 +245,13 @@
             self.initialize_empty_fl()
         lengths = [len(v) if isinstance(v, (list, tuple)) else 1 for v in data.values()]
 
-        if len(set(lengths)) == 1 and lengths[0]>1:
+        if len(set(lengths)) == 1 and lengths[0] > 1:
             self.number_of_records = lengths[0]
             self.data_frame = pl.DataFrame(data)
         else:
             self.number_of_records = 1
             self.data_frame = pl.DataFrame([data])
+        self.lazy = True
 
     def _handle_raw_data_format(self, raw_data: input_schema.RawData):
         """Create a FlowDataEngine from a RawData object."""
@@ -226,13 +285,384 @@ class FlowDataEngine:
         if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
             try:
                 return pl.DataFrame(data).to_dicts()
-            except:
+            except TypeError:
                 raise Exception('Value must be able to be converted to dictionary')
+            except Exception as e:
+                raise Exception(f'Value must be able to be converted to dictionary: {e}')
 
         if not isinstance(data[0], dict):
             data = [row.__dict__ for row in data]
 
-        return utils.ensure_similarity_dicts(data)
+        return ensure_similarity_dicts(data)
+
+    def to_cloud_storage_obj(self, settings: cloud_storage_schemas.CloudStorageWriteSettingsInternal):
+        """
+        Write the FlowDataEngine's data to an object in cloud storage.
+
+        Supports writing to S3, Azure ADLS, and Google Cloud Storage. The 'overwrite'
+        write mode is supported. The 'append' mode is not yet implemented.
+
+        Args:
+            settings: Cloud storage write settings with connection details and write options.
+
+        Raises:
+            ValueError: If file format is not supported.
+            NotImplementedError: If the 'append' write mode is used.
+            Exception: If writing to cloud storage fails.
+        """
+        connection = settings.connection
+        write_settings = settings.write_settings
+
+        logger.info(f"Writing to {connection.storage_type} storage: {write_settings.resource_path}")
+
+        if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
+            raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
+
+        storage_options = CloudStorageReader.get_storage_options(connection)
+        credential_provider = CloudStorageReader.get_credential_provider(connection)
+        # Dispatch to the correct writer based on file format
+        if write_settings.file_format == "parquet":
+            self._write_parquet_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        elif write_settings.file_format == "delta":
+            self._write_delta_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        elif write_settings.file_format == "csv":
+            self._write_csv_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        elif write_settings.file_format == "json":
+            self._write_json_to_cloud(
+                write_settings.resource_path,
+                storage_options,
+                credential_provider,
+                write_settings
+            )
+        else:
+            raise ValueError(f"Unsupported file format for writing: {write_settings.file_format}")
+
+        logger.info(f"Successfully wrote data to {write_settings.resource_path}")
+
+    def _write_parquet_to_cloud(self,
+                                resource_path: str,
+                                storage_options: Dict[str, Any],
+                                credential_provider: Optional[Callable],
+                                write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        """Write LazyFrame to a Parquet file in cloud storage."""
+        try:
+            sink_kwargs = {
+                "path": resource_path,
+                "compression": write_settings.parquet_compression,
+            }
+            if storage_options:
+                sink_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                sink_kwargs["credential_provider"] = credential_provider
+            try:
+                self.data_frame.sink_parquet(**sink_kwargs)
+            except:
+                pl_df = self.collect()
+                sink_kwargs['file'] = sink_kwargs.pop("path")
+                pl_df.write_parquet(**sink_kwargs)
+
+        except Exception as e:
+            logger.error(f"Failed to write Parquet to {resource_path}: {str(e)}")
+            raise Exception(f"Failed to write Parquet to cloud storage: {str(e)}")
+
+    def _write_delta_to_cloud(self,
+                              resource_path: str,
+                              storage_options: Dict[str, Any],
+                              credential_provider: Optional[Callable],
+                              write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        sink_kwargs = {
+            "target": resource_path,
+            "mode": write_settings.write_mode,
+        }
+        if storage_options:
+            sink_kwargs["storage_options"] = storage_options
+        if credential_provider:
+            sink_kwargs["credential_provider"] = credential_provider
+        self.collect().write_delta(**sink_kwargs)
+
+    def _write_csv_to_cloud(self,
+                            resource_path: str,
+                            storage_options: Dict[str, Any],
+                            credential_provider: Optional[Callable],
+                            write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        """Write LazyFrame to a CSV file in cloud storage."""
+        try:
+            sink_kwargs = {
+                "path": resource_path,
+                "separator": write_settings.csv_delimiter,
+            }
+            if storage_options:
+                sink_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                sink_kwargs["credential_provider"] = credential_provider
+
+            # sink_csv executes the lazy query and writes the result
+            self.data_frame.sink_csv(**sink_kwargs)
+
+        except Exception as e:
+            logger.error(f"Failed to write CSV to {resource_path}: {str(e)}")
+            raise Exception(f"Failed to write CSV to cloud storage: {str(e)}")
+
+    def _write_json_to_cloud(self,
+                             resource_path: str,
+                             storage_options: Dict[str, Any],
+                             credential_provider: Optional[Callable],
+                             write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
+        """Write LazyFrame to a line-delimited JSON (NDJSON) file in cloud storage."""
+        try:
+            sink_kwargs = {"path": resource_path}
+            if storage_options:
+                sink_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                sink_kwargs["credential_provider"] = credential_provider
+            self.data_frame.sink_ndjson(**sink_kwargs)
+
+        except Exception as e:
+            logger.error(f"Failed to write JSON to {resource_path}: {str(e)}")
+            raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")
+
+    @classmethod
+    def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal):
+        """
+        Create a FlowDataEngine from an object in cloud storage.
+
+        Supports reading from S3, Azure ADLS, and Google Cloud Storage with various
+        authentication methods including access keys, IAM roles, and CLI credentials.
+
+        Args:
+            settings: Cloud storage read settings with connection details and read options
+
+        Returns:
+            FlowDataEngine: New instance with data from cloud storage
+
+        Raises:
+            ValueError: If storage type or file format is not supported
+            Exception: If reading from cloud storage fails
+        """
+        connection = settings.connection
+        read_settings = settings.read_settings
+
+        logger.info(f"Reading from {connection.storage_type} storage: {read_settings.resource_path}")
+        # Get storage options based on connection type
+        storage_options = CloudStorageReader.get_storage_options(connection)
+        # Get credential provider if needed
+        credential_provider = CloudStorageReader.get_credential_provider(connection)
+        if read_settings.file_format == "parquet":
+            return cls._read_parquet_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings.scan_mode == "directory",
+            )
+        elif read_settings.file_format == "delta":
+            return cls._read_delta_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings
+            )
+        elif read_settings.file_format == "csv":
+            return cls._read_csv_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings
+            )
+        elif read_settings.file_format == "json":
+            return cls._read_json_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings.scan_mode == "directory"
+            )
+        elif read_settings.file_format == "iceberg":
+            return cls._read_iceberg_from_cloud(
+                read_settings.resource_path,
+                storage_options,
+                credential_provider,
+                read_settings
+            )
+
+        elif read_settings.file_format in ["delta", "iceberg"]:
+            # These would require additional libraries
+            raise NotImplementedError(f"File format {read_settings.file_format} not yet implemented")
+        else:
+            raise ValueError(f"Unsupported file format: {read_settings.file_format}")
+
+    @staticmethod
+    def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any]) -> List[FlowfileColumn] | None:
+        try:
+            first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
+            return convert_stats_to_column_info(FlowDataEngine._create_schema_stats_from_pl_schema(
+                pl.scan_parquet(first_file_ref, storage_options=storage_options).collect_schema()))
+        except Exception as e:
+            logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")
+
+
+    @classmethod
+    def _read_iceberg_from_cloud(cls,
+                                 resource_path: str,
+                                 storage_options: Dict[str, Any],
+                                 credential_provider: Optional[Callable],
+                                 read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+        """Read Iceberg table(s) from cloud storage."""
+        raise NotImplementedError(f"Failed to read Iceberg table from cloud storage: Not yet implemented")
+
+    @classmethod
+    def _read_parquet_from_cloud(cls,
+                                 resource_path: str,
+                                 storage_options: Dict[str, Any],
+                                 credential_provider: Optional[Callable],
+                                 is_directory: bool) -> "FlowDataEngine":
+        """Read Parquet file(s) from cloud storage."""
+        try:
+            # Use scan_parquet for lazy evaluation
+            if is_directory:
+                resource_path = ensure_path_has_wildcard_pattern(resource_path=resource_path, file_format="parquet")
+            scan_kwargs = {"source": resource_path}
+
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+            if storage_options and is_directory:
+                schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
+            else:
+                schema = None
+            lf = pl.scan_parquet(**scan_kwargs)
+
+            return cls(
+                lf,
+                number_of_records=6_666_666,  # Set to 6666666 so that the provider is not accessed for this stat
+                optimize_memory=True,
+                streamable=True,
+                schema=schema
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to read Parquet from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read Parquet from cloud storage: {str(e)}")
+
+    @classmethod
+    def _read_delta_from_cloud(cls,
+                               resource_path: str,
+                               storage_options: Dict[str, Any],
+                               credential_provider: Optional[Callable],
+                               read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+        try:
+            logger.info("Reading Delta file from cloud storage...")
+            logger.info(f"read_settings: {read_settings}")
+            scan_kwargs = {"source": resource_path}
+            if read_settings.delta_version:
+                scan_kwargs['version'] = read_settings.delta_version
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+            lf = pl.scan_delta(**scan_kwargs)
+
+            return cls(
+                lf,
+                number_of_records=6_666_666,  # Set to 6666666 so that the provider is not accessed for this stat
+                optimize_memory=True,
+                streamable=True
+            )
+        except Exception as e:
+            logger.error(f"Failed to read Delta file from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read Delta file from cloud storage: {str(e)}")
+
+    @classmethod
+    def _read_csv_from_cloud(cls,
+                             resource_path: str,
+                             storage_options: Dict[str, Any],
+                             credential_provider: Optional[Callable],
+                             read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
+        """Read CSV file(s) from cloud storage."""
+        try:
+            scan_kwargs = {
+                "source": resource_path,
+                "has_header": read_settings.csv_has_header,
+                "separator": read_settings.csv_delimiter,
+                "encoding": read_settings.csv_encoding,
+            }
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+
+            if read_settings.scan_mode == "directory":
+                resource_path = ensure_path_has_wildcard_pattern(resource_path=resource_path, file_format="csv")
+                scan_kwargs["source"] = resource_path
+            if storage_options and read_settings.scan_mode == "directory":
+                schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
+            else:
+                schema = None
+
+            lf = pl.scan_csv(**scan_kwargs)
+
+            return cls(
+                lf,
+                number_of_records=6_666_666,  # Will be calculated lazily
+                optimize_memory=True,
+                streamable=True,
+                schema=schema
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to read CSV from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read CSV from cloud storage: {str(e)}")
+
+    @classmethod
+    def _read_json_from_cloud(cls,
+                              resource_path: str,
+                              storage_options: Dict[str, Any],
+                              credential_provider: Optional[Callable],
+                              is_directory: bool) -> "FlowDataEngine":
+        """Read JSON file(s) from cloud storage."""
+        try:
+            scan_kwargs = {"source": resource_path}
+
+            if storage_options:
+                scan_kwargs["storage_options"] = storage_options
+            if credential_provider:
+                scan_kwargs["credential_provider"] = credential_provider
+
+            if is_directory:
+                resource_path = ensure_path_has_wildcard_pattern(resource_path, "json")
+            if storage_options and is_directory:
+                schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
+            else:
+                schema = None
+
+            lf = pl.scan_ndjson(**scan_kwargs)  # Using NDJSON for line-delimited JSON
+
+            return cls(
+                lf,
+                number_of_records=-1,
+                optimize_memory=True,
+                streamable=True,
+                schema=schema
+            )
+
+        except Exception as e:
+            logger.error(f"Failed to read JSON from {resource_path}: {str(e)}")
+            raise Exception(f"Failed to read JSON from cloud storage: {str(e)}")
 
     def _handle_path_ref(self, path_ref: str, optimize_memory: bool):
         """Handle file path reference input."""
@@ -255,16 +685,20 @@
         _ = calculate_schema_stats
         self.name = name
         self._optimize_memory = optimize_memory
-        pl_schema = self.data_frame.collect_schema()
-        self._schema = self._handle_schema(schema, pl_schema)
-        self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()
+        if assert_if_flowfile_schema(schema):
+            self._schema = schema
+            self.columns = [c.column_name for c in self._schema]
+        else:
+            pl_schema = self.data_frame.collect_schema()
+            self._schema = self._handle_schema(schema, pl_schema)
+            self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()
 
     def __getitem__(self, item):
         """Access a specific column or item from the DataFrame."""
         return self.data_frame.select([item])
 
     @property
-    def data_frame(self) -> pl.LazyFrame | pl.DataFrame:
+    def data_frame(self) -> pl.LazyFrame | pl.DataFrame | None:
         """Get the underlying DataFrame with appropriate handling of different states."""
         if self._data_frame is not None and not self.is_future:
             return self._data_frame
@@ -289,6 +723,16 @@
             raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
         self._data_frame = df
 
+    @staticmethod
+    def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> List[Dict]:
+        return [
+            dict(column_name=k, pl_datatype=v, col_index=i)
+            for i, (k, v) in enumerate(pl_schema.items())
+        ]
+
+    def _add_schema_from_schema_stats(self, schema_stats: List[Dict]):
+        self._schema = convert_stats_to_column_info(schema_stats)
+
     @property
     def schema(self) -> List[FlowfileColumn]:
         """Get the schema of the DataFrame, calculating if necessary."""
@@ -299,11 +743,8 @@
             schema_stats = self._calculate_schema()
             self.ind_schema_calculated = True
         else:
-            schema_stats = [
-                dict(column_name=k, pl_datatype=v, col_index=i)
-                for i, (k, v) in enumerate(self.data_frame.collect_schema().items())
-            ]
-        self._schema = convert_stats_to_column_info(schema_stats)
+            schema_stats = self._create_schema_stats_from_pl_schema(self.data_frame.collect_schema())
+        self._add_schema_from_schema_stats(schema_stats)
         return self._schema
 
     @property
@@ -338,6 +779,7 @@
     def _collect_data(self, n_records: int = None) -> pl.DataFrame:
         """Internal method to handle data collection."""
         if n_records is None:
+
             self.collect_external()
             if self._streamable:
                 try:
@@ -353,7 +795,7 @@
             return self._collect_from_external_source(n_records)
 
         if self._streamable:
-            return self.data_frame.head(n_records).collect(engine="streaming", comm_subplan_elim=False)
+            return self.data_frame.head(n_records).collect(engine="streaming")
         return self.data_frame.head(n_records).collect()
 
     def _collect_from_external_source(self, n_records: int) -> pl.DataFrame:
@@ -480,8 +922,17 @@
             return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
         return self.data_frame.to_dicts()
 
+    def to_raw_data(self) -> input_schema.RawData:
+        """Convert the DataFrame to a list of values."""
+        columns = [c.get_minimal_field_info() for c in self.schema]
+        data = list(self.to_dict().values())
+        return input_schema.RawData(columns=columns, data=data)
+
     def to_dict(self) -> Dict[str, List]:
-        return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
+        if self.lazy:
+            return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
+        else:
+            return self.data_frame.to_dict(as_series=False)
 
     @classmethod
     def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
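
Example (not part of the diff): a small round trip through the new to_raw_data helper and the branching to_dict, assuming a FlowDataEngine built directly from a dict of columns, which the widened raw_data type hint above allows.

    from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine

    engine = FlowDataEngine({"id": [1, 2, 3], "name": ["a", "b", "c"]})
    as_dict = engine.to_dict()    # {"id": [1, 2, 3], "name": ["a", "b", "c"]}
    raw = engine.to_raw_data()    # input_schema.RawData with column info plus list-of-lists data
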
@@ -514,7 +965,6 @@
     def create_from_path(cls, received_table: input_schema.ReceivedTableBase) -> "FlowDataEngine":
         """Create a FlowDataEngine from a file path."""
         received_table.set_absolute_filepath()
-
         file_type_handlers = {
             'csv': create_funcs.create_from_path_csv,
             'parquet': create_funcs.create_from_path_parquet,
@@ -541,25 +991,26 @@
             length = 10_000_000
         return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
 
-    # Schema Handling Methods
-
-    def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema,
+    def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema | None,
                        pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
         """Handle schema processing and validation."""
-        if schema is None:
+        if schema is None and pl_schema is not None:
+            return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
+        elif schema is None and pl_schema is None:
             return None
-
-        if schema.__len__() != pl_schema.__len__():
-            raise Exception(
-                f'Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}')
-
-        if isinstance(schema, pl.Schema):
-            return self._handle_polars_schema(schema, pl_schema)
-        elif isinstance(schema, list) and len(schema) == 0:
-            return []
-        elif isinstance(schema[0], str):
-            return self._handle_string_schema(schema, pl_schema)
-        return schema
+        elif assert_if_flowfile_schema(schema) and pl_schema is None:
+            return schema
+        elif pl_schema is not None and schema is not None:
+            if schema.__len__() != pl_schema.__len__():
+                raise Exception(
+                    f'Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}')
+            if isinstance(schema, pl.Schema):
+                return self._handle_polars_schema(schema, pl_schema)
+            elif isinstance(schema, list) and len(schema) == 0:
+                return []
+            elif isinstance(schema[0], str):
+                return self._handle_string_schema(schema, pl_schema)
+            return schema
 
     def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
         """Handle Polars schema conversion."""
@@ -847,7 +1298,6 @@
         """
         n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=True))
         logging.info(f'Getting sample of {n_rows} rows')
-
        if random:
            if self.lazy and self.external_source is not None:
                self.collect_external()
@@ -1049,21 +1499,15 @@
        Raises:
            Exception: If join would result in too many records or is invalid
        """
-        # self.lazy = False if join_input.how == 'right' else True
-        # other.lazy = False if join_input.how == 'right' else True
-
+        ensure_right_unselect_for_semi_and_anti_joins(join_input)
         verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
         if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
             raise Exception('Join is not valid by the data fields')
+
         if auto_generate_selection:
             join_input.auto_rename()
-
-        right_select = [v.old_name for v in join_input.right_select.renames
-                        if (v.keep or v.join_key) and v.is_available]
-        left_select = [v.old_name for v in join_input.left_select.renames
-                       if (v.keep or v.join_key) and v.is_available]
-        left = self.data_frame.select(left_select).rename(join_input.left_select.rename_table)
-        right = other.data_frame.select(right_select).rename(join_input.right_select.rename_table)
+        left = self.data_frame.select(get_select_columns(join_input.left_select.renames)).rename(join_input.left_select.rename_table)
+        right = other.data_frame.select(get_select_columns(join_input.right_select.renames)).rename(join_input.right_select.rename_table)
 
         if verify_integrity and join_input.how != 'right':
             n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
@@ -1072,25 +1516,42 @@
                 raise Exception("Join will result in too many records, ending process")
         else:
             n_records = -1
+        left, right, reverse_join_key_mapping = _handle_duplication_join_keys(left, right, join_input)
+        left, right = rename_df_table_for_join(left, right, join_input.get_join_key_renames())
         if join_input.how == 'right':
-            # Default to left join since right join can give panic issues in execution plan downstream
-            joined_df = right.join(left, left_on=join_input.right_join_keys,
-                                   right_on=join_input.left_join_keys, how="left", suffix="")
+            joined_df = right.join(
+                other=left,
+                left_on=join_input.right_join_keys,
+                right_on=join_input.left_join_keys,
+                how="left",
+                suffix="").rename(reverse_join_key_mapping)
         else:
-            joined_df = left.join(right, left_on=join_input.left_join_keys,
-                                  right_on=join_input.right_join_keys,
-                                  how=join_input.how, suffix="")
-        cols_to_delete_after = [col.new_name for col in
-                                join_input.left_select.renames + join_input.left_select.renames
-                                if col.join_key and not col.keep and col.is_available]
-        if len(cols_to_delete_after) > 0:
-            joined_df = joined_df.drop(cols_to_delete_after)
+            joined_df = left.join(
+                other=right,
+                left_on=join_input.left_join_keys,
+                right_on=join_input.right_join_keys,
+                how=join_input.how,
+                suffix="").rename(reverse_join_key_mapping)
+        left_cols_to_delete_after = [get_col_name_to_delete(col, 'left') for col in join_input.left_select.renames
+                                     if not col.keep
+                                     and col.is_available and col.join_key
+                                     ]
+        right_cols_to_delete_after = [get_col_name_to_delete(col, 'right') for col in join_input.right_select.renames
+                                      if not col.keep
+                                      and col.is_available and col.join_key
+                                      and join_input.how in ("left", "right", "inner", "cross", "outer")
+                                      ]
+        if len(right_cols_to_delete_after + left_cols_to_delete_after) > 0:
+            joined_df = joined_df.drop(left_cols_to_delete_after + right_cols_to_delete_after)
+        undo_join_key_remapping = get_undo_rename_mapping_join(join_input)
+        joined_df = joined_df.rename(undo_join_key_remapping)
+
         if verify_integrity:
             return FlowDataEngine(joined_df, calculate_schema_stats=True,
-                                  number_of_records=n_records, streamable=False)
+                                   number_of_records=n_records, streamable=False)
         else:
             fl = FlowDataEngine(joined_df, calculate_schema_stats=False,
-                                number_of_records=0, streamable=False)
+                                 number_of_records=0, streamable=False)
             return fl
 
     # Graph Operations
@@ -1152,6 +1613,7 @@
         other.lazy = False
         self.number_of_records = -1
         other.number_of_records = -1
+        other = other.select_columns(self.columns)
 
         if self.get_number_of_records() != other.get_number_of_records():
             raise Exception('Number of records is not equal')
@@ -1556,3 +2018,4 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowDataEngine":
     if isinstance(df, pl.DataFrame):
         logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
     return FlowDataEngine(df)
+