Flowfile 0.3.1.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (100)
  1. flowfile/__init__.py +2 -1
  2. flowfile/api.py +5 -3
  3. flowfile/web/__init__.py +3 -0
  4. flowfile/web/static/assets/{AirbyteReader-cb0c1d4a.js → AirbyteReader-2b1cf2d8.js} +10 -9
  5. flowfile/web/static/assets/{CrossJoin-a514fa59.js → CrossJoin-cc3ab73c.js} +8 -8
  6. flowfile/web/static/assets/{DatabaseConnectionSettings-f2cecf33.js → DatabaseConnectionSettings-307c4652.js} +2 -2
  7. flowfile/web/static/assets/{DatabaseManager-83ee3c98.js → DatabaseManager-69faa6e1.js} +10 -6
  8. flowfile/web/static/assets/{DatabaseReader-dc0c6881.js → DatabaseReader-e4134cd0.js} +9 -9
  9. flowfile/web/static/assets/{DatabaseWriter-5afe9f8d.js → DatabaseWriter-d32d75b1.js} +9 -9
  10. flowfile/web/static/assets/{ExploreData-c7ee19cf.js → ExploreData-5eb48389.js} +18639 -18629
  11. flowfile/web/static/assets/{ExternalSource-17b23a01.js → ExternalSource-29489051.js} +8 -21
  12. flowfile/web/static/assets/{Filter-90856b4f.js → Filter-031332bb.js} +9 -9
  13. flowfile/web/static/assets/{Formula-38b71e9e.js → Formula-3b900540.js} +15 -15
  14. flowfile/web/static/assets/{Formula-d60a74f4.css → Formula-b8cefc31.css} +4 -4
  15. flowfile/web/static/assets/{FuzzyMatch-d0f1fe81.js → FuzzyMatch-dee31153.js} +9 -9
  16. flowfile/web/static/assets/{GraphSolver-0c86bbc6.js → GraphSolver-ca74eb47.js} +5 -5
  17. flowfile/web/static/assets/{GroupBy-f2772e9f.js → GroupBy-081b6591.js} +8 -7
  18. flowfile/web/static/assets/{Join-bc3e1cf7.js → Join-b467376f.js} +11 -10
  19. flowfile/web/static/assets/{ManualInput-03aa0245.js → ManualInput-ffffb80a.js} +11 -8
  20. flowfile/web/static/assets/{Output-5b35eee8.js → Output-9a87d4ba.js} +4 -4
  21. flowfile/web/static/assets/{Pivot-7164087c.js → Pivot-ee3e6093.js} +8 -7
  22. flowfile/web/static/assets/{PolarsCode-3abf6507.js → PolarsCode-03921254.js} +13 -11
  23. flowfile/web/static/assets/{PopOver-b37ff9be.js → PopOver-3bdf8951.js} +1 -1
  24. flowfile/web/static/assets/{Read-65966a3e.js → Read-67fee3a0.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-c66c6d6d.js → RecordCount-a2acd02d.js} +7 -6
  26. flowfile/web/static/assets/{RecordId-826dc095.js → RecordId-0c8bcd77.js} +10 -8
  27. flowfile/web/static/assets/{Sample-4ed555c8.js → Sample-60594a3a.js} +7 -6
  28. flowfile/web/static/assets/{SecretManager-eac1e97d.js → SecretManager-bbcec2ac.js} +2 -2
  29. flowfile/web/static/assets/{Select-085f05cc.js → Select-9540e6ca.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-1f5e79c1.js → SettingsSection-48f28104.js} +1 -1
  31. flowfile/web/static/assets/{Sort-3e6cb414.js → Sort-6dbe3633.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-606349bc.js → TextToRows-27aab4a8.js} +18 -13
  33. flowfile/web/static/assets/{UnavailableFields-b41976ed.js → UnavailableFields-8143044b.js} +2 -2
  34. flowfile/web/static/assets/{Union-fca91665.js → Union-52460248.js} +7 -6
  35. flowfile/web/static/assets/{Unique-a59f830e.js → Unique-f6962644.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-c3815565.js → Unpivot-1ff1e938.js} +5 -5
  37. flowfile/web/static/assets/{api-22b338bd.js → api-3b345d92.js} +1 -1
  38. flowfile/web/static/assets/{designer-e5bbe26f.js → designer-4736134f.js} +72 -42
  39. flowfile/web/static/assets/{documentation-08045cf2.js → documentation-b9545eba.js} +1 -1
  40. flowfile/web/static/assets/{dropDown-5e7e9a5a.js → dropDown-d5a4014c.js} +1 -1
  41. flowfile/web/static/assets/{dropDownGeneric-50a91b99.js → dropDownGeneric-1f4e32ec.js} +2 -2
  42. flowfile/web/static/assets/{fullEditor-705c6ccb.js → fullEditor-f4791c23.js} +3 -3
  43. flowfile/web/static/assets/{genericNodeSettings-65587f20.js → genericNodeSettings-1d456350.js} +3 -3
  44. flowfile/web/static/assets/{index-552863fd.js → index-f25c9283.js} +2608 -1570
  45. flowfile/web/static/assets/{nodeTitle-cf9bae3c.js → nodeTitle-cad6fd9d.js} +3 -3
  46. flowfile/web/static/assets/{secretApi-3ad510e1.js → secretApi-01f07e2c.js} +1 -1
  47. flowfile/web/static/assets/{selectDynamic-bd644891.js → selectDynamic-f46a4e3f.js} +3 -3
  48. flowfile/web/static/assets/{vue-codemirror.esm-dd17b478.js → vue-codemirror.esm-eb98fc8b.js} +15 -14
  49. flowfile/web/static/assets/{vue-content-loader.es-6b36f05e.js → vue-content-loader.es-860c0380.js} +1 -1
  50. flowfile/web/static/index.html +1 -1
  51. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/METADATA +1 -3
  52. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/RECORD +97 -88
  53. flowfile_core/configs/__init__.py +15 -4
  54. flowfile_core/configs/node_store/nodes.py +2 -4
  55. flowfile_core/configs/settings.py +5 -3
  56. flowfile_core/configs/utils.py +18 -0
  57. flowfile_core/flowfile/FlowfileFlow.py +84 -29
  58. flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
  59. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +55 -18
  60. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
  61. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
  62. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +34 -2
  63. flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
  64. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
  65. flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
  66. flowfile_core/flowfile/flow_graph_utils.py +320 -0
  67. flowfile_core/flowfile/flow_node/flow_node.py +2 -1
  68. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
  69. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +0 -1
  70. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
  71. flowfile_core/flowfile/utils.py +34 -3
  72. flowfile_core/main.py +2 -3
  73. flowfile_core/routes/secrets.py +1 -1
  74. flowfile_core/schemas/input_schema.py +12 -14
  75. flowfile_core/schemas/transform_schema.py +25 -47
  76. flowfile_frame/__init__.py +11 -4
  77. flowfile_frame/adding_expr.py +280 -0
  78. flowfile_frame/config.py +9 -0
  79. flowfile_frame/expr.py +301 -83
  80. flowfile_frame/expr.pyi +2174 -0
  81. flowfile_frame/expr_name.py +258 -0
  82. flowfile_frame/flow_frame.py +616 -627
  83. flowfile_frame/flow_frame.pyi +336 -0
  84. flowfile_frame/flow_frame_methods.py +617 -0
  85. flowfile_frame/group_frame.py +89 -42
  86. flowfile_frame/join.py +1 -2
  87. flowfile_frame/lazy.py +704 -0
  88. flowfile_frame/lazy_methods.py +201 -0
  89. flowfile_frame/list_name_space.py +324 -0
  90. flowfile_frame/selectors.py +3 -0
  91. flowfile_frame/series.py +70 -0
  92. flowfile_frame/utils.py +80 -4
  93. flowfile/web/static/assets/GoogleSheet-854294a4.js +0 -2616
  94. flowfile/web/static/assets/GoogleSheet-92084da7.css +0 -233
  95. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +0 -74
  96. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/LICENSE +0 -0
  97. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/WHEEL +0 -0
  98. {flowfile-0.3.1.2.dist-info → flowfile-0.3.3.dist-info}/entry_points.txt +0 -0
  99. /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
  100. /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0

flowfile_core/configs/settings.py
@@ -5,10 +5,10 @@ import os
 import tempfile
 import argparse
 
-from databases import DatabaseURL
 from passlib.context import CryptContext
 from starlette.config import Config
-from starlette.datastructures import Secret
+
+from flowfile_core.configs.utils import MutableBool
 
 
 # Constants for server and worker configuration
@@ -18,6 +18,9 @@ DEFAULT_WORKER_PORT = 63579
 SINGLE_FILE_MODE: bool = os.environ.get("SINGLE_FILE_MODE", "0") == "1"
 
 
+OFFLOAD_TO_WORKER = MutableBool(True)
+
+
 def parse_args():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(description="Flowfile Backend Server")
@@ -79,7 +82,6 @@ args = parse_args()
 SERVER_HOST = args.host if args.host is not None else DEFAULT_SERVER_HOST
 SERVER_PORT = args.port if args.port is not None else DEFAULT_SERVER_PORT
 WORKER_PORT = args.worker_port if args.worker_port is not None else int(os.getenv("WORKER_PORT", DEFAULT_WORKER_PORT))
-# Worker configuration
 WORKER_HOST = os.getenv("WORKER_HOST", "0.0.0.0" if platform.system() != "Windows" else "127.0.0.1")
 
 config = Config(".env")

flowfile_core/configs/utils.py
@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class MutableBool:
+    value: bool
+
+    def __bool__(self) -> bool:
+        """Allow direct boolean evaluation"""
+        return self.value
+
+    def __eq__(self, other) -> bool:
+        """Allow equality comparison with booleans"""
+        if isinstance(other, bool):
+            return self.value == other
+        elif isinstance(other, MutableBool):
+            return self.value == other.value
+        return NotImplemented
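
The wrapper exists because a plain module-level bool cannot be toggled at runtime for other modules: "from settings import FLAG" binds the current value, while importing a MutableBool object lets every importer observe in-place changes. A minimal sketch of the intended usage (the toggling call site below is illustrative, not code from the package):

    from dataclasses import dataclass

    @dataclass
    class MutableBool:
        value: bool

        def __bool__(self) -> bool:
            return self.value

    # settings.py defines the shared flag once:
    OFFLOAD_TO_WORKER = MutableBool(True)

    # another module holds a reference to the same object,
    # so an in-place mutation is visible everywhere:
    flag = OFFLOAD_TO_WORKER
    OFFLOAD_TO_WORKER.value = False
    assert not flag  # __bool__ reflects the in-place change
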
flowfile_core/flowfile/FlowfileFlow.py
@@ -2,6 +2,8 @@ import datetime
 import pickle
 import polars as pl
 import fastexcel
+import copy
+
 from fastapi.exceptions import HTTPException
 from time import time
 from functools import partial
@@ -13,7 +15,7 @@ from flowfile_core.configs import logger
 from flowfile_core.configs.flow_logger import FlowLogger
 from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
 from flowfile_core.flowfile.sources.external_sources.airbyte_sources.settings import airbyte_settings_from_config
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import type_to_polars_str, FlowfileColumn
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
 from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.settings_validator import (calculate_fuzzy_match_schema,
                                                                                        pre_calculate_pivot_schema)
 from flowfile_core.utils.arrow_reader import get_read_top_n
@@ -23,7 +25,7 @@ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_x
 from flowfile_core.flowfile.sources import external_sources
 from flowfile_core.schemas import input_schema, schemas, transform_schema
 from flowfile_core.schemas.output_model import TableExample, NodeData, NodeResult, RunInformation
-from flowfile_core.flowfile.utils import snake_case_to_camel_case
+from flowfile_core.flowfile.utils import snake_case_to_camel_case, _handle_raw_data
 from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
 from flowfile_core.flowfile.flow_node.flow_node import FlowNode
 from flowfile_core.flowfile.util.execution_orderer import determine_execution_order
@@ -32,7 +34,7 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_op
     ExternalDatabaseFetcher,
     ExternalDatabaseWriter,
     ExternalDfFetcher)
-from flowfile_core.secrets.secrets import get_encrypted_secret, decrypt_secret
+from flowfile_core.secret_manager.secret_manager import get_encrypted_secret, decrypt_secret
 from flowfile_core.flowfile.sources.external_sources.sql_source import utils as sql_utils, models as sql_models
 from flowfile_core.flowfile.sources.external_sources.sql_source.sql_source import SqlSource, BaseSqlSource
 from flowfile_core.flowfile.database_connection_manager.db_connections import get_local_database_connection
@@ -203,28 +205,20 @@ class FlowGraph:
         sample_size: int = 10000
 
         def analysis_preparation(flowfile_table: FlowDataEngine):
-
-            if flowfile_table.number_of_records<0:
-
-                number_of_records = ExternalDfFetcher(
-                    lf=flowfile_table.data_frame,
-                    operation_type="calculate_number_of_records",
-                    flow_id=self.flow_id,
-                    node_id=node.node_id,
-                ).result
+            if flowfile_table.number_of_records <= 0:
+                number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
             else:
                 number_of_records = flowfile_table.number_of_records
             if number_of_records > sample_size:
                 flowfile_table = flowfile_table.get_sample(sample_size, random=True)
-
             external_sampler = ExternalDfFetcher(
                 lf=flowfile_table.data_frame,
-                file_ref=node.hash,
+                file_ref="__gf_walker"+node.hash,
                 wait_on_completion=True,
                 node_id=node.node_id,
                 flow_id=self.flow_id,
             )
-            node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref, 10000)
+            node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
             return flowfile_table
 
         def schema_callback():
@@ -439,11 +433,11 @@ class FlowGraph:
 
     def add_formula(self, function_settings: input_schema.NodeFormula):
         error = ""
-        if function_settings.function.field.data_type is not None:
-            output_type = type_to_polars_str(function_settings.function.field.data_type)
+        if function_settings.function.field.data_type not in (None, "Auto"):
+            output_type = cast_str_to_polars_type(function_settings.function.field.data_type)
         else:
            output_type = None
-        if output_type is not None:
+        if output_type not in (None, "Auto"):
            new_col = [FlowfileColumn.from_input(column_name=function_settings.function.field.name,
                                                 data_type=str(output_type))]
        else:
@@ -485,7 +479,8 @@ class FlowGraph:
                       function=_func,
                       input_columns=[],
                       node_type='cross_join',
-                      setting_input=cross_join_settings)
+                      setting_input=cross_join_settings,
+                      input_node_ids=cross_join_settings.depending_on_ids)
         return self
 
     def add_join(self, join_settings: input_schema.NodeJoin) -> "FlowGraph":
@@ -587,6 +582,8 @@ class FlowGraph:
         input_cols = set(f.name for f in table.schema)
         ids_to_remove = []
         for i, select_col in enumerate(select_cols):
+            if select_col.data_type is None:
+                select_col.data_type = table.get_schema_column(select_col.old_name).data_type
             if select_col.old_name not in input_cols:
                 select_col.is_available = False
             if not select_col.keep:
900
897
  if external_source_input.source_settings.fields and len(external_source_input.source_settings.fields) > 0:
901
898
  logger.info('Using provided schema in the node')
902
899
 
903
- def add_google_sheet(self, external_source_input: input_schema.NodeExternalSource):
904
- logger.info('Adding google sheet reader')
905
- self.add_external_source(external_source_input)
906
900
 
907
901
  def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
908
902
  logger.info('Adding sql source')
@@ -1044,11 +1038,10 @@
         return self
 
     def add_datasource(self, input_file: input_schema.NodeDatasource | input_schema.NodeManualInput):
-
         if isinstance(input_file, input_schema.NodeManualInput):
-            input_data = FlowDataEngine(input_file.raw_data)
+            _handle_raw_data(input_file)
+            input_data = FlowDataEngine(input_file.raw_data_format)
             ref = 'manual_input'
-
         else:
             input_data = FlowDataEngine(path_ref=input_file.file_ref)
             ref = 'datasource'
@@ -1061,7 +1054,9 @@
 
         if not input_file.node_id in set(start_node.node_id for start_node in self._flow_starts):
             self._flow_starts.append(node)
+
         else:
+            input_data.collect()
             node = FlowNode(input_file.node_id, function=input_data,
                             setting_input=input_file,
                             name=ref, node_type=ref, parent_uuid=self.uuid)
@@ -1083,7 +1078,7 @@
         self._output_cols += cols_available
 
     @property
-    def input_data_columns(self) -> List[str]:
+    def input_data_columns(self) -> List[str] | None:
         if self._input_cols:
             return list(set([col for col in self._input_cols if
                              col in [table_col.name for table_col in self._input_data.schema]]))
@@ -1102,7 +1097,7 @@
         return implicit_starting_nodes
 
     @execution_mode.setter
-    def execution_mode(self, mode: str):
+    def execution_mode(self, mode: schemas.ExecutionModeLiteral):
         self.flow_settings.execution_mode = mode
 
     @property
@@ -1158,13 +1153,13 @@
                     continue
                 node_result.success = node.results.errors is None
                 node_result.end_timestamp = time()
-                node_result.run_time = node_result.end_timestamp - node_result.start_timestamp
+                node_result.run_time = int(node_result.end_timestamp - node_result.start_timestamp)
                 node_result.is_running = False
             except Exception as e:
                 node_result.error = 'Node did not run'
                 node_result.success = False
                 node_result.end_timestamp = time()
-                node_result.run_time = node_result.end_timestamp - node_result.start_timestamp
+                node_result.run_time = int(node_result.end_timestamp - node_result.start_timestamp)
                 node_result.is_running = False
                 node_logger.error(f'Error in node {node.node_id}: {e}')
             if not node_result.success:
@@ -1352,6 +1347,66 @@ class FlowGraph:
         getattr(self, f"add_{node_type}")(combined_settings)
 
 
+def combine_flow_graphs(*flow_graphs: FlowGraph) -> FlowGraph:
+    """
+    Combine multiple flow graphs into a single graph, ensuring node IDs don't overlap.
+
+    Args:
+        *flow_graphs: Multiple FlowGraph instances to combine
+
+    Returns:
+        A new FlowGraph containing all nodes and edges from the input graphs with remapped IDs
+
+    Raises:
+        ValueError: If any flow_ids overlap
+    """
+    # Validate flow IDs are unique
+    _validate_unique_flow_ids(flow_graphs)
+
+    # Create ID mapping for all nodes
+    node_id_mapping = _create_node_id_mapping(flow_graphs)
+
+    # Remap and combine nodes
+    all_nodes = _remap_nodes(flow_graphs, node_id_mapping)
+
+    # Create a new combined flow graph
+    combined_flow_id = hash(tuple(fg.flow_id for fg in flow_graphs))
+    # return FlowGraph(flow_id=combined_flow_id, nodes=all_nodes, edges=all_edges)
+
+
+def _validate_unique_flow_ids(flow_graphs: Tuple[FlowGraph, ...]) -> None:
+    """Ensure all flow graphs have unique flow_ids."""
+    all_flow_ids = [fg.flow_id for fg in flow_graphs]
+    if len(all_flow_ids) != len(set(all_flow_ids)):
+        raise ValueError("Cannot combine overlapping graphs, make sure the graphs have a unique identifier")
+
+
+def _create_node_id_mapping(flow_graphs: Tuple[FlowGraph, ...]) -> Dict[int, Dict[int, int]]:
+    """Create a mapping from original node IDs to new unique node IDs."""
+    node_id_mapping: Dict[int, Dict[int, int]] = {}
+    next_node_id = 0
+
+    for fg in flow_graphs:
+        node_id_mapping[fg.flow_id] = {}
+        for node in fg.nodes:
+            node_id_mapping[fg.flow_id][node.node_id] = next_node_id
+            next_node_id += 1
+
+    return node_id_mapping
+
+
+def _remap_nodes(flow_graphs: Tuple[FlowGraph, ...],
+                 node_id_mapping: Dict[int, Dict[int, int]]) -> List:
+    """Create new nodes with remapped IDs."""
+    all_nodes = []
+    for fg in flow_graphs:
+        for node in fg.nodes:
+            new_node = copy.deepcopy(node)
+            new_node.node_id = node_id_mapping[fg.flow_id][node.node_id]
+            all_nodes.append(new_node)
+    return all_nodes
+
+
 def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
     """Combine excopy_nodeisting settings with new settings from a NodePromise."""
     copied_setting_input = deepcopy(setting_input)
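
The new helpers make node IDs unique across graphs by walking every graph in order and handing out fresh sequential IDs (note that combine_flow_graphs itself still ships with its return statement commented out in this release). The remapping scheme in isolation, with plain tuples standing in for FlowGraph objects:

    from typing import Dict, List, Tuple

    # Each "graph" is (flow_id, [node_ids]); mirrors _create_node_id_mapping above.
    graphs: List[Tuple[int, List[int]]] = [(1, [1, 2, 3]), (2, [1, 2])]

    mapping: Dict[int, Dict[int, int]] = {}
    next_id = 0
    for flow_id, node_ids in graphs:
        mapping[flow_id] = {}
        for node_id in node_ids:
            mapping[flow_id][node_id] = next_id  # per-graph id -> new global id
            next_id += 1

    # Node 1 of graph 1 and node 1 of graph 2 no longer collide:
    assert mapping[1][1] == 0 and mapping[2][1] == 3
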

flowfile_core/flowfile/database_connection_manager/db_connections.py
@@ -1,7 +1,7 @@
 from flowfile_core.schemas.input_schema import FullDatabaseConnection, FullDatabaseConnectionInterface
 from sqlalchemy.orm import Session
 from flowfile_core.database.models import DatabaseConnection as DBConnectionModel, Secret
-from flowfile_core.secrets.secrets import store_secret, SecretInput, decrypt_secret
+from flowfile_core.secret_manager.secret_manager import store_secret, SecretInput, decrypt_secret
 from flowfile_core.database.connection import get_db_context
 
 

flowfile_core/flowfile/flow_data_engine/flow_data_engine.py
@@ -17,6 +17,7 @@ from pyarrow.parquet import ParquetFile
 # Local imports - Core
 from flowfile_core.configs import logger
 from flowfile_core.configs.flow_logger import NodeLogger
+from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
 from flowfile_core.schemas import (
     input_schema,
     transform_schema as transform_schemas
@@ -29,7 +30,7 @@ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
     FlowfileColumn,
     convert_stats_to_column_info
 )
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
 from flowfile_core.flowfile.flow_data_engine.join import (
     verify_join_select_integrity,
@@ -109,7 +110,7 @@ class FlowDataEngine:
     # flow_id: int = None # TODO: Implement flow_id
 
     def __init__(self,
-                 raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame] = None,
+                 raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
                  path_ref: str = None,
                  name: str = None,
                  optimize_memory: bool = True,
@@ -147,7 +148,10 @@
 
     def _handle_raw_data(self, raw_data, number_of_records, optimize_memory):
         """Process different types of input data."""
-        if isinstance(raw_data, pl.DataFrame):
+
+        if isinstance(raw_data, input_schema.RawData):
+            self._handle_raw_data_format(raw_data)
+        elif isinstance(raw_data, pl.DataFrame):
             self._handle_polars_dataframe(raw_data, number_of_records)
         elif isinstance(raw_data, pl.LazyFrame):
             self._handle_polars_lazy_frame(raw_data, number_of_records, optimize_memory)
@@ -190,6 +194,20 @@
         self.number_of_records = 1
         self.data_frame = pl.DataFrame([data])
 
+    def _handle_raw_data_format(self, raw_data: input_schema.RawData):
+        """Create a FlowDataEngine from a RawData object."""
+        flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
+        polars_schema = pl.Schema([(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
+                                   for flowfile_column in flowfile_schema])
+        try:
+            df = pl.DataFrame(raw_data.data, polars_schema)
+        except TypeError as e:
+            logger.warning(f"Could not parse the data with the schema:\n{e}")
+            df = pl.DataFrame(raw_data.data)
+        self.number_of_records = len(df)
+        self.data_frame = df.lazy()
+        self.lazy = True
+
     def _handle_list_input(self, data: List):
         """Handle list input."""
         number_of_records = len(data)
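
_handle_raw_data_format tries a schema-first construction and falls back to Polars' own type inference when the payload does not match the declared schema. The same pattern in isolation (plain Polars, no Flowfile types):

    import polars as pl

    data = {"id": [1, 2], "name": ["a", "b"]}
    schema = pl.Schema([("id", pl.Int64), ("name", pl.String)])

    try:
        df = pl.DataFrame(data, schema)  # strict: honour the declared schema
    except TypeError:
        df = pl.DataFrame(data)          # fallback: let Polars infer the types

    lf = df.lazy()                       # downstream code keeps working lazily
    print(df.schema)
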
@@ -462,6 +480,9 @@
             return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
         return self.data_frame.to_dicts()
 
+    def to_dict(self) -> Dict[str, List]:
+        return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
+
     @classmethod
     def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
         """Create a FlowDataEngine from an external data source."""
@@ -484,7 +505,7 @@
         """Create a FlowDataEngine from a schema definition."""
         pl_schema = []
         for i, flow_file_column in enumerate(schema):
-            pl_schema.append((flow_file_column.name, type_to_polars(flow_file_column.data_type)))
+            pl_schema.append((flow_file_column.name, cast_str_to_polars_type(flow_file_column.data_type)))
             schema[i].col_index = i
         df = pl.LazyFrame(schema=pl_schema)
         return cls(df, schema=schema, calculate_schema_stats=False, number_of_records=0)
@@ -824,7 +845,7 @@
         Returns:
             FlowDataEngine: New instance with sampled data
         """
-        n_records = min(n_rows, self.number_of_records)
+        n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=True))
         logging.info(f'Getting sample of {n_rows} rows')
 
         if random:
@@ -1158,14 +1179,25 @@
         self.number_of_records = 0
         self._lazy = True
 
-    def get_number_of_records(self, warn: bool = False, force_calculate: bool = False) -> int:
+    def _calculate_number_of_records_in_worker(self) -> int:
+        number_of_records = ExternalDfFetcher(
+            lf=self.data_frame,
+            operation_type="calculate_number_of_records",
+            flow_id=-1,
+            node_id=-1,
+            wait_on_completion=True
+        ).result
+        return number_of_records
+
+    def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
+                              calculate_in_worker_process: bool = False) -> int:
         """
         Get the total number of records in the DataFrame.
 
         Args:
             warn: Whether to warn about expensive operations
             force_calculate: Whether to force recalculation
-
+            calculate_in_worker_process: Whether to offload compute to the worker process
         Returns:
             int: Number of records
 
@@ -1174,22 +1206,24 @@
         """
         if self.is_future and not self.is_collected:
             return -1
-
+        calculate_in_worker_process = False if not OFFLOAD_TO_WORKER.value else calculate_in_worker_process
         if self.number_of_records is None or self.number_of_records < 0 or force_calculate:
             if self._number_of_records_callback is not None:
                 self._number_of_records_callback(self)
 
             if self.lazy:
-                if warn:
-                    logger.warning('Calculating the number of records this can be expensive on a lazy frame')
-                try:
-                    self.number_of_records = self.data_frame.select(pl.len()).collect(
-                        engine="streaming" if self._streamable else "auto")[0, 0]
-                except Exception:
-                    raise Exception('Could not get number of records')
+                if calculate_in_worker_process:
+                    self.number_of_records = self._calculate_number_of_records_in_worker()
+                else:
+                    if warn:
+                        logger.warning('Calculating the number of records this can be expensive on a lazy frame')
+                    try:
+                        self.number_of_records = self.data_frame.select(pl.len()).collect(
+                            engine="streaming" if self._streamable else "auto")[0, 0]
+                    except Exception:
+                        raise ValueError('Could not get number of records')
             else:
                 self.number_of_records = self.data_frame.__len__()
-
         return self.number_of_records
 
     # Properties
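
Row counting on a lazy frame can now be shipped to the worker process, with OFFLOAD_TO_WORKER acting as a global veto over the per-call argument. A rough, self-contained equivalent of the gating logic (the worker round trip is replaced here by a local count):

    import polars as pl

    OFFLOAD_TO_WORKER = True  # stands in for the MutableBool from settings


    def count_in_worker(lf: pl.LazyFrame) -> int:
        # Placeholder for the ExternalDfFetcher "calculate_number_of_records" job.
        return lf.select(pl.len()).collect()[0, 0]


    def get_number_of_records(lf: pl.LazyFrame, calculate_in_worker_process: bool = False) -> int:
        # The global flag can veto offloading even when the caller requests it.
        calculate_in_worker_process = calculate_in_worker_process and OFFLOAD_TO_WORKER
        if calculate_in_worker_process:
            return count_in_worker(lf)
        return lf.select(pl.len()).collect()[0, 0]


    print(get_number_of_records(pl.LazyFrame({"a": [1, 2, 3]}), calculate_in_worker_process=True))  # 3
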
@@ -1345,7 +1379,7 @@
             FlowDataEngine: New instance with added column
         """
         expr = to_expr(func)
-        if output_data_type is not None:
+        if output_data_type not in (None, "Auto"):
             df = self.data_frame.with_columns(expr.cast(output_data_type).alias(col_name))
         else:
             df = self.data_frame.with_columns(expr.alias(col_name))
@@ -1518,4 +1552,7 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
         kwargs = {'input_df': flowfile_tables[0].data_frame}
     else:
         kwargs = {f'input_df_{i+1}': flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
-    return FlowDataEngine(polars_executable(**kwargs))
+    df = polars_executable(**kwargs)
+    if isinstance(df, pl.DataFrame):
+        logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
+    return FlowDataEngine(df)
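
The new warning fires when user code materializes the frame instead of returning a lazy one. The two shapes of user code the executor can receive, side by side (input_df is the name the parser injects):

    import polars as pl

    input_df = pl.LazyFrame({"a": [1, 2, 3]})

    # Lazy: returns a LazyFrame, no warning, execution stays deferred.
    good = input_df.filter(pl.col("a") > 1)

    # Eager: .collect() yields a DataFrame and would trigger the warning above.
    bad = input_df.filter(pl.col("a") > 1).collect()

    print(type(good).__name__, type(bad).__name__)  # LazyFrame DataFrame
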

flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import Optional, Any, List, Dict, Literal
 from flowfile_core.schemas import input_schema
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars_str
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
 from polars import datatypes
 import polars as pl
@@ -9,6 +9,37 @@ import polars as pl
 DataTypeGroup = Literal['numeric', 'str', 'date']
 
 
+def convert_pl_type_to_string(pl_type: pl.DataType, inner: bool = False) -> str:
+    if isinstance(pl_type, pl.List):
+        inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
+        return f"pl.List({inner_str})"
+    elif isinstance(pl_type, pl.Array):
+        inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
+        return f"pl.Array({inner_str})"
+    elif isinstance(pl_type, pl.Decimal):
+        precision = pl_type.precision if hasattr(pl_type, 'precision') else None
+        scale = pl_type.scale if hasattr(pl_type, 'scale') else None
+        if precision is not None and scale is not None:
+            return f"pl.Decimal({precision}, {scale})"
+        elif precision is not None:
+            return f"pl.Decimal({precision})"
+        else:
+            return "pl.Decimal()"
+    elif isinstance(pl_type, pl.Struct):
+        # Handle Struct with field definitions
+        fields = []
+        if hasattr(pl_type, 'fields'):
+            for field in pl_type.fields:
+                field_name = field.name
+                field_type = convert_pl_type_to_string(field.dtype, inner=True)
+                fields.append(f'pl.Field("{field_name}", {field_type})')
+        field_str = ", ".join(fields)
+        return f"pl.Struct([{field_str}])"
+    else:
+        # For base types, we want the full pl.TypeName format
+        return str(pl_type.base_type()) if not inner else f"pl.{pl_type}"
+
+
 @dataclass
 class FlowfileColumn:
     column_name: str
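
Previously FlowfileColumn stored only str(dtype.base_type()), which collapses pl.List(pl.Int64) to just "List" and drops the inner type; the serializer above keeps nested types round-trippable. A quick check of the difference (the commented lines assume the function above):

    import polars as pl

    print(str(pl.List(pl.Int64).base_type()))        # "List" - inner type lost
    # convert_pl_type_to_string(pl.List(pl.Int64))   # "pl.List(pl.Int64)"
    # convert_pl_type_to_string(pl.Decimal(38, 2))   # "pl.Decimal(38, 2)"
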
@@ -28,7 +59,7 @@ class FlowfileColumn:
     __perc_unique: Optional[float]
 
     def __init__(self, polars_type: PlType):
-        self.data_type = str(polars_type.pl_datatype.base_type())
+        self.data_type = convert_pl_type_to_string(polars_type.pl_datatype)
         self.size = polars_type.count - polars_type.null_count
         self.max_value = polars_type.max
         self.min_value = polars_type.min
@@ -53,7 +84,7 @@
 
     @classmethod
     def from_input(cls, column_name: str, data_type: str, **kwargs) -> "FlowfileColumn":
-        pl_type = type_to_polars_str(data_type)
+        pl_type = cast_str_to_polars_type(data_type)
         if pl_type is not None:
             data_type = pl_type
         return cls(PlType(column_name=column_name, pl_datatype=data_type, **kwargs))
@@ -129,12 +160,9 @@
             return 'date'
 
     def get_polars_type(self) -> PlType:
-        if hasattr(datatypes, self.data_type):
-            pl_datatype = getattr(datatypes, self.data_type)
-        else:
-            pl_datatype = None
-
-        return PlType(pl_datatype=pl_datatype, **self.__dict__)
+        pl_datatype = cast_str_to_polars_type(self.data_type)
+        pl_type = PlType(pl_datatype=pl_datatype, **self.__dict__)
+        return pl_type
 
     def update_type_from_polars_type(self, pl_type: PlType):
         self.data_type = str(pl_type.pl_datatype.base_type())
@@ -142,3 +170,8 @@
 
 def convert_stats_to_column_info(stats: List[Dict]) -> List[FlowfileColumn]:
     return [FlowfileColumn.create_from_polars_type(PlType(**c)) for c in stats]
+
+
+def convert_pl_schema_to_raw_data_format(pl_schema: pl.Schema) -> List[input_schema.MinimalFieldInfo]:
+    return [FlowfileColumn.create_from_polars_type(PlType(column_name=k, pl_datatype=v)).get_minimal_field_info()
+            for k, v in pl_schema.items()]

flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py
@@ -18,10 +18,45 @@ dtype_to_pl = {
     'time': pl.Time,
 }
 
+
+def safe_eval_pl_type(type_string: str):
+    """
+    Safely evaluate a Polars type string with restricted namespace.
+    Only allows Polars types and basic Python literals.
+    """
+    # Define allowed names in the evaluation namespace
+    safe_dict = {
+        # Polars module and types
+        'pl': pl,
+
+        # Basic Python built-ins for literals
+        'int': int,
+        'str': str,
+        'float': float,
+        'bool': bool,
+        'list': list,
+        'dict': dict,
+        'tuple': tuple,
+
+        # Disable dangerous built-ins
+        '__builtins__': {},
+    }
+
+    try:
+        return eval(type_string, safe_dict, {})
+    except Exception as e:
+        raise ValueError(f"Failed to safely evaluate type string '{type_string}': {e}")
+
+
 dtype_to_pl_str = {k: v.__name__ for k, v in dtype_to_pl.items()}
 
 
-def type_to_polars(dtype: str):
+def get_polars_type(dtype: str):
+    if 'pl.' in dtype:
+        try:
+            return safe_eval_pl_type(dtype)
+        except Exception as e:
+            return pl.String
     pl_datetype = dtype_to_pl.get(dtype.lower())
     if pl_datetype is not None:
         return pl_datetype
@@ -31,6 +66,10 @@ def type_to_polars(dtype: str):
     return pl.String
 
 
-def type_to_polars_str(dtype: str) -> pl.DataType:
-    return type_to_polars(dtype)()
+def cast_str_to_polars_type(dtype: str) -> pl.DataType:
+    pl_type = get_polars_type(dtype)
+    if hasattr(pl_type, '__call__'):
+        return pl_type()
+    else:
+        return pl_type
 
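Type strings containing "pl." (the format convert_pl_type_to_string now emits) are evaluated in a namespace restricted to the polars module plus a few literal constructors, so nested types parse back while built-ins stay unreachable; anything else falls through to the plain name lookup. The restricted-eval idea, trimmed to a standalone sketch:

    import polars as pl


    def parse_pl_type(type_string: str) -> pl.DataType:
        # Only pl is visible and built-ins are disabled inside the eval.
        safe_dict = {"pl": pl, "__builtins__": {}}
        return eval(type_string, safe_dict, {})


    print(parse_pl_type("pl.List(pl.Int64)"))  # List(Int64)
    print(parse_pl_type("pl.Decimal(38, 2)"))  # Decimal(precision=38, scale=2)
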
flowfile_core/flowfile/flow_data_engine/polars_code_parser.py
@@ -3,6 +3,7 @@ from typing import Dict, Any, Callable
 import textwrap
 import ast
 import time
+from io import BytesIO
 
 
 def remove_comments_and_docstrings(source: str) -> str:
@@ -126,6 +127,37 @@ class PolarsCodeParser:
         'col': pl.col,
         'lit': pl.lit,
         'expr': pl.expr,
+
+        # Polars datatypes - added directly
+        'Int8': pl.Int8,
+        'Int16': pl.Int16,
+        'Int32': pl.Int32,
+        'Int64': pl.Int64,
+        'Int128': pl.Int128,
+        'UInt8': pl.UInt8,
+        'UInt16': pl.UInt16,
+        'UInt32': pl.UInt32,
+        'UInt64': pl.UInt64,
+        'Float32': pl.Float32,
+        'Float64': pl.Float64,
+        'Boolean': pl.Boolean,
+        'String': pl.String,
+        'Utf8': pl.Utf8,
+        'Binary': pl.Binary,
+        'Null': pl.Null,
+        'List': pl.List,
+        'Array': pl.Array,
+        'Struct': pl.Struct,
+        'Object': pl.Object,
+        'Date': pl.Date,
+        'Time': pl.Time,
+        'Datetime': pl.Datetime,
+        'Duration': pl.Duration,
+        'Categorical': pl.Categorical,
+        'Decimal': pl.Decimal,
+        'Enum': pl.Enum,
+        'Unknown': pl.Unknown,
+
         # Basic Python built-ins
         'print': print,
         'len': len,
@@ -142,7 +174,8 @@ class PolarsCodeParser:
         'True': True,
         'False': False,
         'None': None,
-        'time': time
+        'time': time,
+        'BytesIO': BytesIO
     }
 
     @staticmethod
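
With the dtype names injected, node scripts can write casts such as col('price').cast(Int64) without importing polars themselves. A simulation of how names resolve from such a namespace (the namespace and user_code below are illustrative, not the parser's actual wiring):

    import polars as pl

    # Simulated slice of the parser namespace: bare dtype names resolve directly.
    namespace = {
        "col": pl.col,
        "Int64": pl.Int64,
        "input_df": pl.LazyFrame({"price": ["1", "2"]}),
    }

    user_code = "output_df = input_df.with_columns(col('price').cast(Int64))"
    exec(user_code, namespace)
    print(namespace["output_df"].collect())  # price column is now Int64
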
@@ -225,7 +258,6 @@ class PolarsCodeParser:
 
         # Wrap the code in a function
         wrapped_code = self._wrap_in_function(code, num_inputs)
-
         try:
             # Create namespace for execution
             local_namespace: Dict[str, Any] = {}