Flowfile 0.3.1.1__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
- flowfile/api.py +10 -4
- flowfile/web/static/assets/{AirbyteReader-cb0c1d4a.js → AirbyteReader-2b1cf2d8.js} +10 -9
- flowfile/web/static/assets/{CrossJoin-a514fa59.js → CrossJoin-cc3ab73c.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-f2cecf33.js → DatabaseConnectionSettings-307c4652.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-83ee3c98.js → DatabaseManager-69faa6e1.js} +10 -6
- flowfile/web/static/assets/{DatabaseReader-dc0c6881.js → DatabaseReader-e4134cd0.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-5afe9f8d.js → DatabaseWriter-d32d75b1.js} +9 -9
- flowfile/web/static/assets/{ExploreData-c7ee19cf.js → ExploreData-5eb48389.js} +18639 -18629
- flowfile/web/static/assets/{ExternalSource-17b23a01.js → ExternalSource-29489051.js} +8 -21
- flowfile/web/static/assets/{Filter-90856b4f.js → Filter-031332bb.js} +9 -9
- flowfile/web/static/assets/{Formula-38b71e9e.js → Formula-3b900540.js} +15 -15
- flowfile/web/static/assets/{Formula-d60a74f4.css → Formula-b8cefc31.css} +4 -4
- flowfile/web/static/assets/{FuzzyMatch-d0f1fe81.js → FuzzyMatch-dee31153.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-0c86bbc6.js → GraphSolver-ca74eb47.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f2772e9f.js → GroupBy-081b6591.js} +8 -7
- flowfile/web/static/assets/{Join-bc3e1cf7.js → Join-b467376f.js} +11 -10
- flowfile/web/static/assets/{ManualInput-03aa0245.js → ManualInput-ffffb80a.js} +11 -8
- flowfile/web/static/assets/{Output-5b35eee8.js → Output-9a87d4ba.js} +4 -4
- flowfile/web/static/assets/{Pivot-7164087c.js → Pivot-ee3e6093.js} +8 -7
- flowfile/web/static/assets/{PolarsCode-3abf6507.js → PolarsCode-03921254.js} +13 -11
- flowfile/web/static/assets/{PopOver-b37ff9be.js → PopOver-3bdf8951.js} +1 -1
- flowfile/web/static/assets/{Read-65966a3e.js → Read-67fee3a0.js} +6 -6
- flowfile/web/static/assets/{RecordCount-c66c6d6d.js → RecordCount-a2acd02d.js} +7 -6
- flowfile/web/static/assets/{RecordId-826dc095.js → RecordId-0c8bcd77.js} +10 -8
- flowfile/web/static/assets/{Sample-4ed555c8.js → Sample-60594a3a.js} +7 -6
- flowfile/web/static/assets/{SecretManager-eac1e97d.js → SecretManager-bbcec2ac.js} +2 -2
- flowfile/web/static/assets/{Select-085f05cc.js → Select-9540e6ca.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-1f5e79c1.js → SettingsSection-48f28104.js} +1 -1
- flowfile/web/static/assets/{Sort-3e6cb414.js → Sort-6dbe3633.js} +6 -6
- flowfile/web/static/assets/{TextToRows-606349bc.js → TextToRows-27aab4a8.js} +18 -13
- flowfile/web/static/assets/{UnavailableFields-b41976ed.js → UnavailableFields-8143044b.js} +2 -2
- flowfile/web/static/assets/{Union-fca91665.js → Union-52460248.js} +7 -6
- flowfile/web/static/assets/{Unique-a59f830e.js → Unique-f6962644.js} +8 -8
- flowfile/web/static/assets/{Unpivot-c3815565.js → Unpivot-1ff1e938.js} +5 -5
- flowfile/web/static/assets/{api-22b338bd.js → api-3b345d92.js} +1 -1
- flowfile/web/static/assets/{designer-e5bbe26f.js → designer-4736134f.js} +72 -42
- flowfile/web/static/assets/{documentation-08045cf2.js → documentation-b9545eba.js} +1 -1
- flowfile/web/static/assets/{dropDown-5e7e9a5a.js → dropDown-d5a4014c.js} +1 -1
- flowfile/web/static/assets/{dropDownGeneric-50a91b99.js → dropDownGeneric-1f4e32ec.js} +2 -2
- flowfile/web/static/assets/{fullEditor-705c6ccb.js → fullEditor-f4791c23.js} +3 -3
- flowfile/web/static/assets/{genericNodeSettings-65587f20.js → genericNodeSettings-1d456350.js} +3 -3
- flowfile/web/static/assets/{index-552863fd.js → index-f25c9283.js} +2608 -1570
- flowfile/web/static/assets/{nodeTitle-cf9bae3c.js → nodeTitle-cad6fd9d.js} +3 -3
- flowfile/web/static/assets/{secretApi-3ad510e1.js → secretApi-01f07e2c.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-bd644891.js → selectDynamic-f46a4e3f.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-dd17b478.js → vue-codemirror.esm-eb98fc8b.js} +15 -14
- flowfile/web/static/assets/{vue-content-loader.es-6b36f05e.js → vue-content-loader.es-860c0380.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.1.1.dist-info → flowfile-0.3.2.dist-info}/METADATA +1 -3
- {flowfile-0.3.1.1.dist-info → flowfile-0.3.2.dist-info}/RECORD +62 -64
- flowfile_core/configs/node_store/nodes.py +2 -4
- flowfile_core/flowfile/FlowfileFlow.py +72 -12
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1 -1
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +32 -1
- flowfile_core/flowfile/flow_graph_utils.py +320 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +0 -1
- flowfile_core/schemas/input_schema.py +2 -10
- flowfile_frame/__init__.py +1 -1
- flowfile_frame/flow_frame.py +455 -51
- flowfile/web/static/assets/GoogleSheet-854294a4.js +0 -2616
- flowfile/web/static/assets/GoogleSheet-92084da7.css +0 -233
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +0 -74
- {flowfile-0.3.1.1.dist-info → flowfile-0.3.2.dist-info}/LICENSE +0 -0
- {flowfile-0.3.1.1.dist-info → flowfile-0.3.2.dist-info}/WHEEL +0 -0
- {flowfile-0.3.1.1.dist-info → flowfile-0.3.2.dist-info}/entry_points.txt +0 -0
flowfile_core/flowfile/FlowfileFlow.py
CHANGED

@@ -2,6 +2,8 @@ import datetime
 import pickle
 import polars as pl
 import fastexcel
+import copy
+
 from fastapi.exceptions import HTTPException
 from time import time
 from functools import partial
@@ -203,8 +205,7 @@ class FlowGraph:
         sample_size: int = 10000

         def analysis_preparation(flowfile_table: FlowDataEngine):
-
-            if flowfile_table.number_of_records<0:
+            if flowfile_table.number_of_records < 0:

                 number_of_records = ExternalDfFetcher(
                     lf=flowfile_table.data_frame,
@@ -219,7 +220,7 @@ class FlowGraph:

             external_sampler = ExternalDfFetcher(
                 lf=flowfile_table.data_frame,
-                file_ref=node.hash,
+                file_ref="__gf_walker"+node.hash,
                 wait_on_completion=True,
                 node_id=node.node_id,
                 flow_id=self.flow_id,
@@ -439,11 +440,11 @@ class FlowGraph:

     def add_formula(self, function_settings: input_schema.NodeFormula):
         error = ""
-        if function_settings.function.field.data_type
+        if function_settings.function.field.data_type not in (None, "Auto"):
             output_type = type_to_polars_str(function_settings.function.field.data_type)
         else:
             output_type = None
-        if output_type
+        if output_type not in (None, "Auto"):
             new_col = [FlowfileColumn.from_input(column_name=function_settings.function.field.name,
                                                  data_type=str(output_type))]
         else:
@@ -587,6 +588,8 @@ class FlowGraph:
         input_cols = set(f.name for f in table.schema)
         ids_to_remove = []
         for i, select_col in enumerate(select_cols):
+            if select_col.data_type is None:
+                select_col.data_type = table.get_schema_column(select_col.old_name).data_type
             if select_col.old_name not in input_cols:
                 select_col.is_available = False
             if not select_col.keep:
@@ -900,9 +903,6 @@ class FlowGraph:
         if external_source_input.source_settings.fields and len(external_source_input.source_settings.fields) > 0:
             logger.info('Using provided schema in the node')

-    def add_google_sheet(self, external_source_input: input_schema.NodeExternalSource):
-        logger.info('Adding google sheet reader')
-        self.add_external_source(external_source_input)

     def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
         logger.info('Adding sql source')
@@ -1083,7 +1083,7 @@ class FlowGraph:
         self._output_cols += cols_available

     @property
-    def input_data_columns(self) -> List[str]:
+    def input_data_columns(self) -> List[str] | None:
         if self._input_cols:
             return list(set([col for col in self._input_cols if
                              col in [table_col.name for table_col in self._input_data.schema]]))
@@ -1102,7 +1102,7 @@ class FlowGraph:
         return implicit_starting_nodes

     @execution_mode.setter
-    def execution_mode(self, mode:
+    def execution_mode(self, mode: schemas.ExecutionModeLiteral):
         self.flow_settings.execution_mode = mode

     @property
@@ -1158,13 +1158,13 @@ class FlowGraph:
                     continue
                 node_result.success = node.results.errors is None
                 node_result.end_timestamp = time()
-                node_result.run_time = node_result.end_timestamp - node_result.start_timestamp
+                node_result.run_time = int(node_result.end_timestamp - node_result.start_timestamp)
                 node_result.is_running = False
             except Exception as e:
                 node_result.error = 'Node did not run'
                 node_result.success = False
                 node_result.end_timestamp = time()
-                node_result.run_time = node_result.end_timestamp - node_result.start_timestamp
+                node_result.run_time = int(node_result.end_timestamp - node_result.start_timestamp)
                 node_result.is_running = False
                 node_logger.error(f'Error in node {node.node_id}: {e}')
             if not node_result.success:
@@ -1352,6 +1352,66 @@ class FlowGraph:
         getattr(self, f"add_{node_type}")(combined_settings)


+def combine_flow_graphs(*flow_graphs: FlowGraph) -> FlowGraph:
+    """
+    Combine multiple flow graphs into a single graph, ensuring node IDs don't overlap.
+
+    Args:
+        *flow_graphs: Multiple FlowGraph instances to combine
+
+    Returns:
+        A new FlowGraph containing all nodes and edges from the input graphs with remapped IDs
+
+    Raises:
+        ValueError: If any flow_ids overlap
+    """
+    # Validate flow IDs are unique
+    _validate_unique_flow_ids(flow_graphs)
+
+    # Create ID mapping for all nodes
+    node_id_mapping = _create_node_id_mapping(flow_graphs)
+
+    # Remap and combine nodes
+    all_nodes = _remap_nodes(flow_graphs, node_id_mapping)
+
+    # Create a new combined flow graph
+    combined_flow_id = hash(tuple(fg.flow_id for fg in flow_graphs))
+    # return FlowGraph(flow_id=combined_flow_id, nodes=all_nodes, edges=all_edges)
+
+
+def _validate_unique_flow_ids(flow_graphs: Tuple[FlowGraph, ...]) -> None:
+    """Ensure all flow graphs have unique flow_ids."""
+    all_flow_ids = [fg.flow_id for fg in flow_graphs]
+    if len(all_flow_ids) != len(set(all_flow_ids)):
+        raise ValueError("Cannot combine overlapping graphs, make sure the graphs have a unique identifier")
+
+
+def _create_node_id_mapping(flow_graphs: Tuple[FlowGraph, ...]) -> Dict[int, Dict[int, int]]:
+    """Create a mapping from original node IDs to new unique node IDs."""
+    node_id_mapping: Dict[int, Dict[int, int]] = {}
+    next_node_id = 0
+
+    for fg in flow_graphs:
+        node_id_mapping[fg.flow_id] = {}
+        for node in fg.nodes:
+            node_id_mapping[fg.flow_id][node.node_id] = next_node_id
+            next_node_id += 1
+
+    return node_id_mapping
+
+
+def _remap_nodes(flow_graphs: Tuple[FlowGraph, ...],
+                 node_id_mapping: Dict[int, Dict[int, int]]) -> List:
+    """Create new nodes with remapped IDs."""
+    all_nodes = []
+    for fg in flow_graphs:
+        for node in fg.nodes:
+            new_node = copy.deepcopy(node)
+            new_node.node_id = node_id_mapping[fg.flow_id][node.node_id]
+            all_nodes.append(new_node)
+    return all_nodes
+
+
 def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
     """Combine excopy_nodeisting settings with new settings from a NodePromise."""
     copied_setting_input = deepcopy(setting_input)
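The module-level `combine_flow_graphs` added here stops short of returning anything (its final `return` is commented out); the completed implementation ships in the new `flow_graph_utils.py` later in this diff. The remapping scheme itself is worth seeing in isolation: every node gets a fresh sequential ID, keyed by the owning graph's `flow_id`, so two graphs that each contain a node 1 no longer collide. Below is a minimal standalone sketch of that scheme; `_Graph` and `_Node` are illustrative stand-ins, not Flowfile classes.

```python
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class _Node:            # stand-in for a FlowGraph node
    node_id: int

@dataclass
class _Graph:           # stand-in for a FlowGraph
    flow_id: int
    nodes: List[_Node]

def create_node_id_mapping(graphs: List[_Graph]) -> Dict[int, Dict[int, int]]:
    # Same walk as _create_node_id_mapping above: one counter across
    # all graphs, one sub-dict of old -> new IDs per flow_id.
    mapping: Dict[int, Dict[int, int]] = {}
    next_node_id = 0
    for g in graphs:
        mapping[g.flow_id] = {}
        for node in g.nodes:
            mapping[g.flow_id][node.node_id] = next_node_id
            next_node_id += 1
    return mapping

g1 = _Graph(flow_id=1, nodes=[_Node(1), _Node(2)])
g2 = _Graph(flow_id=2, nodes=[_Node(1)])   # node_id 1 collides with g1's
print(create_node_id_mapping([g1, g2]))
# {1: {1: 0, 2: 1}, 2: {1: 2}} -- the collision is resolved per flow_id
```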
flowfile_core/flowfile/flow_data_engine/flow_data_engine.py
CHANGED

@@ -1345,7 +1345,7 @@ class FlowDataEngine:
             FlowDataEngine: New instance with added column
         """
         expr = to_expr(func)
-        if output_data_type
+        if output_data_type not in (None, "Auto"):
             df = self.data_frame.with_columns(expr.cast(output_data_type).alias(col_name))
         else:
             df = self.data_frame.with_columns(expr.alias(col_name))
flowfile_core/flowfile/flow_data_engine/polars_code_parser.py
CHANGED

@@ -126,6 +126,37 @@ class PolarsCodeParser:
             'col': pl.col,
             'lit': pl.lit,
             'expr': pl.expr,
+
+            # Polars datatypes - added directly
+            'Int8': pl.Int8,
+            'Int16': pl.Int16,
+            'Int32': pl.Int32,
+            'Int64': pl.Int64,
+            'Int128': pl.Int128,
+            'UInt8': pl.UInt8,
+            'UInt16': pl.UInt16,
+            'UInt32': pl.UInt32,
+            'UInt64': pl.UInt64,
+            'Float32': pl.Float32,
+            'Float64': pl.Float64,
+            'Boolean': pl.Boolean,
+            'String': pl.String,
+            'Utf8': pl.Utf8,
+            'Binary': pl.Binary,
+            'Null': pl.Null,
+            'List': pl.List,
+            'Array': pl.Array,
+            'Struct': pl.Struct,
+            'Object': pl.Object,
+            'Date': pl.Date,
+            'Time': pl.Time,
+            'Datetime': pl.Datetime,
+            'Duration': pl.Duration,
+            'Categorical': pl.Categorical,
+            'Decimal': pl.Decimal,
+            'Enum': pl.Enum,
+            'Unknown': pl.Unknown,
+
             # Basic Python built-ins
             'print': print,
             'len': len,
@@ -142,7 +173,7 @@ class PolarsCodeParser:
             'True': True,
             'False': False,
             'None': None,
-            'time': time
+            'time': time,
         }

     @staticmethod
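With these names injected, user snippets evaluated against the parser's namespace can reference dtypes like `Int64` or `Float64` directly, with no `import polars` inside the snippet. A rough sketch of the mechanism, assuming the parser executes user code against a namespace dict like the one above (the exact evaluation internals are not shown in this diff):

```python
import polars as pl

# Pruned stand-in for the parser namespace built above.
namespace = {
    "col": pl.col,
    "lit": pl.lit,
    "Int64": pl.Int64,
    "Float64": pl.Float64,
}

# A user-supplied snippet: the dtype name resolves via the namespace,
# so the snippet itself needs no polars import.
user_code = 'expr = col("amount").cast(Float64) + lit(1)'
exec(user_code, namespace)
print(namespace["expr"])   # the constructed polars expression
```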
flowfile_core/flowfile/flow_graph_utils.py
ADDED

@@ -0,0 +1,320 @@
+from typing import Dict, Tuple, Optional, List, Set, Callable
+from copy import deepcopy
+from flowfile_core.schemas import input_schema, schemas
+from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
+
+
+def combine_flow_graphs_with_mapping(
+        *flow_graphs: FlowGraph,
+        target_flow_id: Optional[int] = None) -> Tuple[FlowGraph, Dict[Tuple[int, int], int]]:
+    # Validate input parameters
+    _validate_input(flow_graphs)
+
+    # Generate a unique flow ID if not provided
+    if target_flow_id is None:
+        target_flow_id = _generate_unique_flow_id(flow_graphs)
+
+    flow_settings = _create_flow_settings(flow_graphs[0], target_flow_id)
+    combined_graph = FlowGraph(flow_id=target_flow_id, flow_settings=flow_settings)
+    node_id_mapping = _create_node_id_mapping(flow_graphs)
+    _add_nodes_to_combined_graph(flow_graphs, combined_graph, node_id_mapping, target_flow_id)
+    _add_connections_to_combined_graph(flow_graphs, combined_graph, node_id_mapping)
+    return combined_graph, node_id_mapping
+
+
+def combine_flow_graphs(*flow_graphs: FlowGraph, target_flow_id: Optional[int] = None) -> FlowGraph:
+    """
+    Combine multiple flow graphs into a single graph, ensuring node IDs don't overlap.
+
+    Args:
+        *flow_graphs: Multiple FlowGraph instances to combine
+        target_flow_id: Optional ID for the new combined graph. If None, a new ID will be generated.
+
+    Returns:
+        A new FlowGraph containing all nodes and edges from the input graphs with remapped IDs
+
+    Raises:
+        ValueError: If no flow graphs are provided
+    """
+    # Validate input parameters
+    _validate_input(flow_graphs)
+
+    # Generate a unique flow ID if not provided
+    if target_flow_id is None:
+        target_flow_id = _generate_unique_flow_id(flow_graphs)
+
+    flow_settings = _create_flow_settings(flow_graphs[0], target_flow_id)
+    combined_graph = FlowGraph(flow_id=target_flow_id, flow_settings=flow_settings)
+    node_id_mapping = _create_node_id_mapping(flow_graphs)
+    _add_nodes_to_combined_graph(flow_graphs, combined_graph, node_id_mapping, target_flow_id)
+    _add_connections_to_combined_graph(flow_graphs, combined_graph, node_id_mapping)
+
+    return combined_graph
+
+
+def _validate_input(flow_graphs: Tuple[FlowGraph, ...]) -> None:
+    """
+    Validate input parameters.
+
+    Args:
+        flow_graphs: Flow graphs to validate
+
+    Raises:
+        ValueError: If validation fails
+    """
+    if not flow_graphs:
+        raise ValueError("At least one FlowGraph must be provided")
+
+    # Check for duplicate flow IDs
+    flow_ids = [fg.flow_id for fg in flow_graphs]
+    if len(flow_ids) != len(set(flow_ids)):
+        raise ValueError("Cannot combine flows with duplicate flow IDs")
+
+
+def _generate_unique_flow_id(flow_graphs: Tuple[FlowGraph, ...]) -> int:
+    """
+    Generate a unique flow ID based on the input flow graphs.
+
+    Args:
+        flow_graphs: Flow graphs to generate ID from
+
+    Returns:
+        int: A new unique flow ID
+    """
+    return abs(hash(tuple(fg.flow_id for fg in flow_graphs))) % 1000000
+
+
+def _create_flow_settings(base_flow_graph: FlowGraph, target_flow_id: int) -> schemas.FlowSettings:
+    """
+    Create flow settings for the combined graph based on an existing graph.
+
+    Args:
+        base_flow_graph: Flow graph to base settings on
+        target_flow_id: The new flow ID
+
+    Returns:
+        schemas.FlowSettings: Flow settings for the combined graph
+    """
+    flow_settings = deepcopy(base_flow_graph.flow_settings)
+    flow_settings.flow_id = target_flow_id
+    flow_settings.name = f"Combined Flow {target_flow_id}"
+    return flow_settings
+
+
+def _create_node_id_mapping(flow_graphs: Tuple[FlowGraph, ...]) -> Dict[Tuple[int, int], int]:
+    """
+    Create a mapping from (flow_id, original_node_id) to new unique node IDs.
+
+    Args:
+        flow_graphs: Flow graphs to process
+
+    Returns:
+        Dict: Mapping from (flow_id, node_id) to new node ID
+    """
+    node_id_mapping = {}
+    next_node_id = _get_next_available_node_id(flow_graphs)
+
+    for fg in flow_graphs:
+        for node in fg.nodes:
+            node_id_mapping[(fg.flow_id, node.node_id)] = next_node_id
+            next_node_id += 1
+
+    return node_id_mapping
+
+
+def _get_next_available_node_id(flow_graphs: Tuple[FlowGraph, ...]) -> int:
+    """
+    Find the next available node ID.
+
+    Args:
+        flow_graphs: Flow graphs to examine
+
+    Returns:
+        int: Next available node ID
+    """
+    max_id = 0
+    for fg in flow_graphs:
+        for node in fg.nodes:
+            max_id = max(max_id, node.node_id)
+    return max_id + 1
+
+
+def _add_nodes_to_combined_graph(
+        flow_graphs: Tuple[FlowGraph, ...],
+        combined_graph: FlowGraph,
+        node_id_mapping: Dict[Tuple[int, int], int],
+        target_flow_id: int
+) -> None:
+    """
+    Add all nodes from source graphs to the combined graph.
+
+    Args:
+        flow_graphs: Source flow graphs
+        combined_graph: Target combined graph
+        node_id_mapping: Mapping of node IDs
+        target_flow_id: Target flow ID
+    """
+    processed_nodes = set()
+
+    for fg in flow_graphs:
+        for node in fg.nodes:
+            # Skip if already processed
+            if (fg.flow_id, node.node_id) in processed_nodes:
+                continue
+
+            # Generate new node ID
+            new_node_id = node_id_mapping[(fg.flow_id, node.node_id)]
+
+            # Create and update setting input
+            setting_input = _create_updated_setting_input(
+                node.setting_input,
+                new_node_id,
+                target_flow_id,
+                fg.flow_id,
+                node_id_mapping
+            )
+
+            # Add node to combined graph
+            _add_node_to_graph(combined_graph, new_node_id, target_flow_id, node.node_type, setting_input)
+
+            processed_nodes.add((fg.flow_id, node.node_id))
+
+
+def _create_updated_setting_input(
+        original_setting_input: any,
+        new_node_id: int,
+        target_flow_id: int,
+        source_flow_id: int,
+        node_id_mapping: Dict[Tuple[int, int], int]
+) -> any:
+    """
+    Create an updated setting input with new node and flow IDs.
+
+    Args:
+        original_setting_input: Original setting input
+        new_node_id: New node ID
+        target_flow_id: Target flow ID
+        source_flow_id: Source flow ID
+        node_id_mapping: Mapping of node IDs
+
+    Returns:
+        Updated setting input
+    """
+    setting_input = deepcopy(original_setting_input)
+
+    # Update node ID
+    if hasattr(setting_input, 'node_id'):
+        setting_input.node_id = new_node_id
+
+    # Update flow ID
+    if hasattr(setting_input, 'flow_id'):
+        setting_input.flow_id = target_flow_id
+
+    # Update depending_on_id if present
+    if hasattr(setting_input, 'depending_on_id') and setting_input.depending_on_id != -1:
+        orig_depending_id = setting_input.depending_on_id
+        setting_input.depending_on_id = node_id_mapping.get((source_flow_id, orig_depending_id), -1)
+
+    # Update depending_on_ids list if present
+    if hasattr(setting_input, 'depending_on_ids'):
+        setting_input.depending_on_ids = [
+            node_id_mapping.get((source_flow_id, dep_id), -1)
+            for dep_id in setting_input.depending_on_ids
+            if dep_id != -1
+        ]
+
+    return setting_input
+
+
+def _add_node_to_graph(
+        graph: FlowGraph,
+        node_id: int,
+        flow_id: int,
+        node_type: str,
+        setting_input: any
+) -> None:
+    """
+    Add a node to the graph.
+
+    Args:
+        graph: Target graph
+        node_id: Node ID
+        flow_id: Flow ID
+        node_type: Node type
+        setting_input: Setting input
+    """
+    # Add node promise to graph
+    node_promise = input_schema.NodePromise(
+        node_id=node_id,
+        flow_id=flow_id,
+        node_type=node_type,
+        is_setup=True,
+        pos_x=getattr(setting_input, 'pos_x', 0),
+        pos_y=getattr(setting_input, 'pos_y', 0),
+        description=getattr(setting_input, 'description', '')
+    )
+    graph.add_node_promise(node_promise)
+
+    # Get node type-specific add method
+    add_method_name = f"add_{node_type}"
+    if hasattr(graph, add_method_name):
+        add_method = getattr(graph, add_method_name)
+        add_method(setting_input)
+
+
+def _add_connections_to_combined_graph(
+        flow_graphs: Tuple[FlowGraph, ...],
+        combined_graph: FlowGraph,
+        node_id_mapping: Dict[Tuple[int, int], int]
+) -> None:
+    """
+    Add all connections from source graphs to the combined graph.
+
+    Args:
+        flow_graphs: Source flow graphs
+        combined_graph: Target combined graph
+        node_id_mapping: Mapping of node IDs
+    """
+    for fg in flow_graphs:
+        for connection in fg.node_connections:
+            source_id, target_id = connection
+            new_source_id = node_id_mapping.get((fg.flow_id, source_id))
+            new_target_id = node_id_mapping.get((fg.flow_id, target_id))
+
+            if new_source_id is not None and new_target_id is not None:
+                input_type = _determine_connection_input_type(fg, source_id, target_id)
+
+                # Create connection in combined graph
+                node_connection = input_schema.NodeConnection.create_from_simple_input(
+                    from_id=new_source_id,
+                    to_id=new_target_id,
+                    input_type=input_type
+                )
+                add_connection(combined_graph, node_connection)
+
+
+def _determine_connection_input_type(
+        flow_graph: FlowGraph,
+        source_id: int,
+        target_id: int
+) -> str:
+    """
+    Determine the input type for a connection.
+
+    Args:
+        flow_graph: Source flow graph
+        source_id: Source node ID
+        target_id: Target node ID
+
+    Returns:
+        str: Input type (main, left, right)
+    """
+    from_node = flow_graph.get_node(source_id)
+    to_node = flow_graph.get_node(target_id)
+
+    if from_node and to_node:
+        input_types = to_node.get_input_type(from_node.node_id)
+        if input_types:
+            return input_types[0]
+
+    return "main"
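The new module exposes two entry points: `combine_flow_graphs` when only the merged graph is needed, and `combine_flow_graphs_with_mapping` when the caller also needs to translate references to the original node IDs. A usage sketch, assuming `graph_a` and `graph_b` are pre-built `FlowGraph` instances with distinct `flow_id`s (construction elided):

```python
from flowfile_core.flowfile.flow_graph_utils import (
    combine_flow_graphs,
    combine_flow_graphs_with_mapping,
)

# graph_a and graph_b are assumed to be existing FlowGraph instances;
# _validate_input rejects duplicate flow_ids.
combined, mapping = combine_flow_graphs_with_mapping(graph_a, graph_b)

# mapping is keyed by (flow_id, original_node_id); look up where a
# node from graph_a landed in the combined graph:
new_id = mapping[(graph_a.flow_id, 1)]

# Or, when the mapping is not needed:
merged = combine_flow_graphs(graph_a, graph_b, target_flow_id=42)
```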
flowfile_core/schemas/input_schema.py
CHANGED

@@ -15,6 +15,7 @@ InputConnectionClass = Literal['input-0', 'input-1', 'input-2', 'input-3', 'inpu

 InputType = Literal["main", "left", "right"]

+
 class NewDirectory(BaseModel):
     source_path: str
     dir_name: str
@@ -341,15 +342,6 @@ class SampleUsers(ExternalSource):
     size: int = 100


-class GoogleSheet(ExternalSource):
-    GOOGLE_SHEET: bool
-    class_name: str = "google_sheet"
-    access_token: SecretStr = None
-    sheet_id: str
-    worksheet_name: str
-    sheet_name: str
-
-
 class AirbyteReader(AirbyteConfig):
     class_name: Optional[str] = "airbyte_reader"
     fields: Optional[List[MinimalFieldInfo]] = None
@@ -362,7 +354,7 @@ class AccessToken(BaseModel):

 class NodeExternalSource(NodeBase):
     identifier: str
-    source_settings: SampleUsers
+    source_settings: SampleUsers


 class NodeAirbyteReader(NodeExternalSource):
flowfile_frame/__init__.py
CHANGED

@@ -23,7 +23,7 @@ from flowfile_frame.selectors import ( # noqa: F401

 # File I/O
 from flowfile_frame.flow_frame import ( # noqa: F401
-    read_csv, read_parquet, from_dict, concat
+    read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet
 )

 from polars.datatypes import ( # noqa: F401