Flowfile 0.3.2__py3-none-any.whl → 0.3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. flowfile/__init__.py +3 -2
  2. flowfile/web/__init__.py +3 -0
  3. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/METADATA +4 -3
  4. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/RECORD +46 -35
  5. flowfile_core/configs/__init__.py +15 -4
  6. flowfile_core/configs/settings.py +5 -3
  7. flowfile_core/configs/utils.py +18 -0
  8. flowfile_core/flowfile/FlowfileFlow.py +13 -18
  9. flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
  10. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +54 -17
  11. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
  12. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
  13. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +2 -1
  14. flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
  15. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
  16. flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
  17. flowfile_core/flowfile/flow_node/flow_node.py +2 -1
  18. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
  19. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
  20. flowfile_core/flowfile/utils.py +34 -3
  21. flowfile_core/main.py +2 -3
  22. flowfile_core/routes/secrets.py +1 -1
  23. flowfile_core/schemas/input_schema.py +10 -4
  24. flowfile_core/schemas/transform_schema.py +25 -47
  25. flowfile_frame/__init__.py +11 -4
  26. flowfile_frame/adding_expr.py +280 -0
  27. flowfile_frame/config.py +9 -0
  28. flowfile_frame/expr.py +301 -83
  29. flowfile_frame/expr.pyi +2174 -0
  30. flowfile_frame/expr_name.py +258 -0
  31. flowfile_frame/flow_frame.py +584 -1002
  32. flowfile_frame/flow_frame.pyi +368 -0
  33. flowfile_frame/flow_frame_methods.py +617 -0
  34. flowfile_frame/group_frame.py +89 -42
  35. flowfile_frame/join.py +1 -2
  36. flowfile_frame/lazy.py +704 -0
  37. flowfile_frame/lazy_methods.py +201 -0
  38. flowfile_frame/list_name_space.py +324 -0
  39. flowfile_frame/selectors.py +3 -0
  40. flowfile_frame/series.py +70 -0
  41. flowfile_frame/utils.py +80 -4
  42. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/LICENSE +0 -0
  43. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/WHEEL +0 -0
  44. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/entry_points.txt +0 -0
  45. /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
  46. /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0

flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py CHANGED
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import Optional, Any, List, Dict, Literal
 from flowfile_core.schemas import input_schema
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars_str
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
 from polars import datatypes
 import polars as pl
@@ -9,6 +9,37 @@ import polars as pl
 DataTypeGroup = Literal['numeric', 'str', 'date']
 
 
+def convert_pl_type_to_string(pl_type: pl.DataType, inner: bool = False) -> str:
+    if isinstance(pl_type, pl.List):
+        inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
+        return f"pl.List({inner_str})"
+    elif isinstance(pl_type, pl.Array):
+        inner_str = convert_pl_type_to_string(pl_type.inner, inner=True)
+        return f"pl.Array({inner_str})"
+    elif isinstance(pl_type, pl.Decimal):
+        precision = pl_type.precision if hasattr(pl_type, 'precision') else None
+        scale = pl_type.scale if hasattr(pl_type, 'scale') else None
+        if precision is not None and scale is not None:
+            return f"pl.Decimal({precision}, {scale})"
+        elif precision is not None:
+            return f"pl.Decimal({precision})"
+        else:
+            return "pl.Decimal()"
+    elif isinstance(pl_type, pl.Struct):
+        # Handle Struct with field definitions
+        fields = []
+        if hasattr(pl_type, 'fields'):
+            for field in pl_type.fields:
+                field_name = field.name
+                field_type = convert_pl_type_to_string(field.dtype, inner=True)
+                fields.append(f'pl.Field("{field_name}", {field_type})')
+        field_str = ", ".join(fields)
+        return f"pl.Struct([{field_str}])"
+    else:
+        # For base types, we want the full pl.TypeName format
+        return str(pl_type.base_type()) if not inner else f"pl.{pl_type}"
+
+
 @dataclass
 class FlowfileColumn:
     column_name: str
@@ -28,7 +59,7 @@ class FlowfileColumn:
     __perc_unique: Optional[float]
 
     def __init__(self, polars_type: PlType):
-        self.data_type = str(polars_type.pl_datatype.base_type())
+        self.data_type = convert_pl_type_to_string(polars_type.pl_datatype)
         self.size = polars_type.count - polars_type.null_count
         self.max_value = polars_type.max
         self.min_value = polars_type.min
@@ -53,7 +84,7 @@ class FlowfileColumn:
 
     @classmethod
     def from_input(cls, column_name: str, data_type: str, **kwargs) -> "FlowfileColumn":
-        pl_type = type_to_polars_str(data_type)
+        pl_type = cast_str_to_polars_type(data_type)
         if pl_type is not None:
             data_type = pl_type
         return cls(PlType(column_name=column_name, pl_datatype=data_type, **kwargs))
@@ -129,12 +160,9 @@ class FlowfileColumn:
         return 'date'
 
     def get_polars_type(self) -> PlType:
-        if hasattr(datatypes, self.data_type):
-            pl_datatype = getattr(datatypes, self.data_type)
-        else:
-            pl_datatype = None
-
-        return PlType(pl_datatype=pl_datatype, **self.__dict__)
+        pl_datatype = cast_str_to_polars_type(self.data_type)
+        pl_type = PlType(pl_datatype=pl_datatype, **self.__dict__)
+        return pl_type
 
     def update_type_from_polars_type(self, pl_type: PlType):
         self.data_type = str(pl_type.pl_datatype.base_type())
@@ -142,3 +170,8 @@
 
 def convert_stats_to_column_info(stats: List[Dict]) -> List[FlowfileColumn]:
     return [FlowfileColumn.create_from_polars_type(PlType(**c)) for c in stats]
+
+
+def convert_pl_schema_to_raw_data_format(pl_schema: pl.Schema) -> List[input_schema.MinimalFieldInfo]:
+    return [FlowfileColumn.create_from_polars_type(PlType(column_name=k, pl_datatype=v)).get_minimal_field_info()
+            for k, v in pl_schema.items()]
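
Review note: replacing `str(pl_datatype.base_type())` with `convert_pl_type_to_string` preserves the inner types of nested dtypes, and `cast_str_to_polars_type` (see the flow_file_column/utils.py hunk below) can rebuild the dtype from that string. A minimal round-trip sketch, standalone and outside Flowfile, using only public Polars APIs:

    import polars as pl

    # A nested dtype the old str(base_type()) call would flatten to just "List".
    nested = pl.List(pl.Struct({"city": pl.String, "population": pl.Int64}))

    # The string form convert_pl_type_to_string is written to emit:
    type_str = 'pl.List(pl.Struct([pl.Field("city", pl.String), pl.Field("population", pl.Int64)]))'

    # Rebuilding is an eval against the polars module (the sandboxed version
    # of this eval is safe_eval_pl_type in the next file).
    rebuilt = eval(type_str, {"pl": pl}, {})
    assert rebuilt == nested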

flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py CHANGED
@@ -18,10 +18,45 @@ dtype_to_pl = {
     'time': pl.Time,
 }
 
+
+def safe_eval_pl_type(type_string: str):
+    """
+    Safely evaluate a Polars type string with restricted namespace.
+    Only allows Polars types and basic Python literals.
+    """
+    # Define allowed names in the evaluation namespace
+    safe_dict = {
+        # Polars module and types
+        'pl': pl,
+
+        # Basic Python built-ins for literals
+        'int': int,
+        'str': str,
+        'float': float,
+        'bool': bool,
+        'list': list,
+        'dict': dict,
+        'tuple': tuple,
+
+        # Disable dangerous built-ins
+        '__builtins__': {},
+    }
+
+    try:
+        return eval(type_string, safe_dict, {})
+    except Exception as e:
+        raise ValueError(f"Failed to safely evaluate type string '{type_string}': {e}")
+
+
 dtype_to_pl_str = {k: v.__name__ for k, v in dtype_to_pl.items()}
 
 
-def type_to_polars(dtype: str):
+def get_polars_type(dtype: str):
+    if 'pl.' in dtype:
+        try:
+            return safe_eval_pl_type(dtype)
+        except Exception as e:
+            return pl.String
     pl_datetype = dtype_to_pl.get(dtype.lower())
     if pl_datetype is not None:
         return pl_datetype
@@ -31,6 +66,10 @@ def get_polars_type(dtype: str):
     return pl.String
 
 
-def type_to_polars_str(dtype: str) -> pl.DataType:
-    return type_to_polars(dtype)()
+def cast_str_to_polars_type(dtype: str) -> pl.DataType:
+    pl_type = get_polars_type(dtype)
+    if hasattr(pl_type, '__call__'):
+        return pl_type()
+    else:
+        return pl_type
 
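Review note: `safe_eval_pl_type` is `eval` with an allow-list namespace and `__builtins__` emptied out, so plain name lookups such as `__import__` or `open` fail. Attribute chains on the exposed `pl` module remain reachable, so this is a mitigation rather than a hard sandbox. An illustrative sketch, assuming the function is importable as defined above:

    import polars as pl
    from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import safe_eval_pl_type

    safe_eval_pl_type("pl.Decimal(38, 2)")  # -> Decimal(precision=38, scale=2)
    safe_eval_pl_type("pl.List(pl.Int64)")  # -> List(Int64)

    # Escape attempts via builtins do not resolve and surface as ValueError:
    try:
        safe_eval_pl_type("__import__('os').system('id')")
    except ValueError as exc:
        print(exc)  # Failed to safely evaluate type string ...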

flowfile_core/flowfile/flow_data_engine/polars_code_parser.py CHANGED
@@ -3,6 +3,7 @@ from typing import Dict, Any, Callable
 import textwrap
 import ast
 import time
+from io import BytesIO
 
 
 def remove_comments_and_docstrings(source: str) -> str:
@@ -174,6 +175,7 @@ class PolarsCodeParser:
         'False': False,
         'None': None,
         'time': time,
+        'BytesIO': BytesIO
     }
 
     @staticmethod
@@ -256,7 +258,6 @@ class PolarsCodeParser:
 
         # Wrap the code in a function
         wrapped_code = self._wrap_in_function(code, num_inputs)
-
        try:
             # Create namespace for execution
             local_namespace: Dict[str, Any] = {}
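
Review note: adding `BytesIO` to the parser's execution namespace lets user-supplied Polars code build in-memory buffers. A hypothetical snippet a Polars-code node could now run (the `input_df` name is an assumption for illustration, and `pl` is presumed to be exposed by the same namespace; neither is shown in this hunk):

    # Inside a user code block executed by PolarsCodeParser:
    buf = BytesIO()
    input_df.write_csv(buf)       # serialize the incoming DataFrame to memory
    buf.seek(0)
    output_df = pl.read_csv(buf)  # read it back without touching disk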

flowfile_core/flowfile/flow_data_engine/sample_data.py CHANGED
@@ -1,14 +1,27 @@
 from faker import Faker
 from functools import partial
+from math import ceil
 from random import randint
 import polars as pl
 from typing import List, Dict, Any, Generator
 
 
-def create_fake_data(n_records: int = 1000) -> pl.DataFrame:
+def create_fake_data(n_records: int = 1000, optimized: bool = True) -> pl.DataFrame:
+    """
+
+    Args:
+        n_records (): Number of records to return
+        optimized (): Indicator if creation should be optimized, will result in more identical rows when True
+
+    Returns:
+        pl.DataFrame
+    """
     fake = Faker()
-    selector = partial(randint,0)
-    min_range = partial(min, n_records)
+    selector = partial(randint, 0)
+
+    max_n_records = min(10_000, n_records) if optimized else n_records
+
+    min_range = partial(min, max_n_records)
     # Pre-generation of static data
     cities = [fake.city() for _ in range(min_range(7000))]
     companies = [fake.company() for _ in range(min_range(100_000))]
@@ -19,7 +32,7 @@ def create_fake_data(n_records: int = 1000, optimized: bool = True) -> pl.DataFrame:
     first_names = [fake.first_name() for _ in range(min_range(100_000))]
     last_names = [fake.last_name() for _ in range(min_range(50_000))]
     domain_names = [fake.domain_name() for _ in range(10)]
-    sales_data = [fake.random_int(0, 1000) for _ in range(n_records)]
+    sales_data = [fake.random_int(0, 1000) for _ in range(max_n_records)]
 
     def generate_name():
         return f"{first_names[selector(min_range(100_000))-1]} {last_names[selector(min_range(50_000))-1]}"
@@ -32,9 +45,8 @@ def create_fake_data(n_records: int = 1000, optimized: bool = True) -> pl.DataFrame:
 
     def generate_phone_number():
         return fake.phone_number()
-
     data = []
-    for i in range(n_records):
+    for i in range(max_n_records):
         name = generate_name()
         data.append(dict(
             ID=randint(1, 1000000),
@@ -47,8 +59,14 @@ def create_fake_data(n_records: int = 1000, optimized: bool = True) -> pl.DataFrame:
             Work=companies[selector(min_range(100_000))-1],
             Zipcode=zipcodes[selector(min_range(200_000))-1],
             Country=countries[selector(min_range(50))-1],
-            sales_data=sales_data[selector(n_records)-1]
+            sales_data=sales_data[selector(max_n_records)-1]
         ))
+    if max_n_records < n_records:
+        n_duplicates: int = ceil(n_records / max_n_records)
+        output = []
+        for _ in range(n_duplicates):
+            output.extend(data)
+        data = output[:n_records]
 
     return pl.DataFrame(data)
 
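Review note: with the new `optimized=True` default, Faker generates at most `min(10_000, n_records)` distinct rows; larger requests are filled by tiling that base list and truncating, which is what "more identical rows" means in the docstring. The arithmetic, worked through on stand-in data:

    from math import ceil

    # e.g. requesting 25_000 optimized records:
    n_records, max_n_records = 25_000, 10_000
    n_duplicates = ceil(n_records / max_n_records)    # 3 copies of the base list
    rows = list(range(max_n_records)) * n_duplicates  # 30_000 stand-in rows
    rows = rows[:n_records]                           # truncated back to 25_000
    assert len(rows) == n_records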

flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py CHANGED
@@ -190,7 +190,7 @@ class BaseFetcher:
             logger.info('Already running the fetching')
             return
 
-        sleep_time = 1
+        sleep_time = .5
         self.running = True
         while not self.stop_event.is_set():
             try:
@@ -205,7 +205,8 @@ class BaseFetcher:
                     break
                 elif status.status == 'Unknown Error':
                     self._handle_error(-1,
-                                       'There was an unknown error with the process, and the process got killed by the server')
+                                       'There was an unknown error with the process, '
+                                       'and the process got killed by the server')
                     break
                 else:
                     self._handle_error(2, r.text)
@@ -284,7 +285,7 @@ class ExternalDfFetcher(BaseFetcher):
 
     def __init__(self, flow_id: int, node_id: int | str, lf: pl.LazyFrame | pl.DataFrame, file_ref: str = None,
                  wait_on_completion: bool = True,
-                 operation_type: OperationType = 'store'):
+                 operation_type: OperationType = 'store', offload_to_worker: bool = True):
         super().__init__(file_ref=file_ref)
         lf = lf.lazy() if isinstance(lf, pl.DataFrame) else lf
         r = trigger_df_operation(lf=lf, file_ref=self.file_ref, operation_type=operation_type,

flowfile_core/flowfile/flow_data_engine/utils.py CHANGED
@@ -210,3 +210,4 @@ def match_order(l: List[str], ref: List[str]) -> List[str]:
         org_order = ref_order.get(v, float('inf'))
         order.append(org_order)
     return [v for _, v in sorted(zip(order, l))]
+

flowfile_core/flowfile/flow_node/flow_node.py CHANGED
@@ -146,6 +146,7 @@ class FlowNode:
             self.node_settings.renew_schema = True
         if hasattr(setting_input, 'cache_results'):
             self.node_settings.cache_results = setting_input.cache_results
+
         self.setting_input = setting_input
         self.results.errors = None
         self.add_lead_to_in_depend_source()
@@ -174,7 +175,7 @@ class FlowNode:
         self.set_node_information()
         if self.node_type == 'manual_input' and isinstance(self._setting_input, input_schema.NodeManualInput):
             if self.hash != self.calculate_hash(setting_input) or not self.node_stats.has_run:
-                self.function = self.function.__class__(setting_input.raw_data)
+                self.function = self.function.__class__(setting_input.raw_data_format)
                 self.reset()
             self.get_predicted_schema()
         elif self._setting_input is not None:

flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py CHANGED
@@ -1,7 +1,7 @@
 from typing import Any, Dict, List, Optional, TYPE_CHECKING, Union
 from pydantic import BaseModel, field_validator, ConfigDict
 import polars as pl
-from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars_str
+from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
 from flowfile_core.schemas.input_schema import MinimalFieldInfo
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
@@ -56,7 +56,7 @@ class JsonSchema(BaseModel):
             dtype = 'string'
         else:
             dtype = type_mapping.get(self.type[0] if isinstance(self.type, list) else self.type, 'string')
-        return type_to_polars_str(dtype)
+        return cast_str_to_polars_type(dtype)
 
 
 class AirbyteProperty(BaseModel):

flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py CHANGED
@@ -4,7 +4,7 @@ from flowfile_core.configs import logger
 from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
 from flowfile_core.schemas.input_schema import MinimalFieldInfo, DatabaseSettings
 from sqlalchemy import Engine, inspect, create_engine, text
-from flowfile_core.secrets.secrets import get_encrypted_secret, decrypt_secret
+from flowfile_core.secret_manager.secret_manager import get_encrypted_secret, decrypt_secret
 
 from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
 from flowfile_core.flowfile.sources.external_sources.sql_source.utils import get_polars_type, construct_sql_uri

flowfile_core/flowfile/utils.py CHANGED
@@ -1,9 +1,15 @@
-from typing import List
 import os
-import shutil
 import hashlib
-from datetime import datetime
 import json
+import polars as pl
+import shutil
+
+from datetime import datetime, date, time
+from typing import List
+from decimal import Decimal
+
+from flowfile_core.flowfile.flow_data_engine.utils import standardize_col_dtype
+from flowfile_core.schemas import input_schema
 
 
 def generate_sha256_hash(data: bytes):
@@ -26,8 +32,16 @@ def snake_case_to_camel_case(text: str) -> str:
 def json_default(val):
     if isinstance(val, datetime):
         return val.isoformat(timespec='microseconds')
+    elif isinstance(val, date):
+        return val.isoformat()
+    elif isinstance(val, time):
+        return val.isoformat()
     elif hasattr(val, '__dict__'):
         return val.__dict__
+    elif isinstance(val, Decimal):
+        if val.as_integer_ratio()[1] == 1:
+            return int(val)
+        return float(val)
     else:
         raise Exception('Value is not serializable')
 
@@ -104,3 +118,20 @@ def batch_generator(input_list: List, batch_size: int = 10000):
         input_list = []
         run = False
 
+
+def _handle_raw_data(node_manual_input: input_schema.NodeManualInput):
+    """Ensure compatibility with the new typed raw data and the old dict form data type"""
+    if (not (hasattr(node_manual_input, "raw_data_format") and node_manual_input.raw_data_format)
+            and (hasattr(node_manual_input, 'raw_data') and node_manual_input.raw_data)):
+        values = [standardize_col_dtype([vv for vv in c]) for c in zip(*(r.values()
+                                                                         for r in node_manual_input.raw_data))]
+        data_types = (pl.DataType.from_python(type(next((v for v in column_values), None))) for column_values in values)
+        _columns = [input_schema.MinimalFieldInfo(name=c, data_type=str(next(data_types))) for c in
+                    node_manual_input.raw_data[0].keys()]
+
+        node_manual_input.raw_data_format = input_schema.RawData(columns=_columns, data=values)
+    elif ((hasattr(node_manual_input, "raw_data_format") and node_manual_input.raw_data_format)
+          and not (hasattr(node_manual_input, 'raw_data') and node_manual_input.raw_data)):
+        node_manual_input.raw_data = [{c.name: node_manual_input.raw_data_format.data[ci][ri] for ci, c in
+                                       enumerate(node_manual_input.raw_data_format.columns)}
+                                      for ri in range(len(node_manual_input.raw_data_format.data[0]))]
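
Review note: `_handle_raw_data` bridges the legacy row-oriented `raw_data` (one dict per row) and the new column-oriented `raw_data_format`. The core of both branches is a transpose; a standalone sketch of the same moves:

    # Legacy form: one dict per row.
    raw_data = [{"name": "a", "qty": 1}, {"name": "b", "qty": 2}]

    # Column-oriented form, as the first branch derives it:
    columns = [list(c) for c in zip(*(r.values() for r in raw_data))]
    # -> [['a', 'b'], [1, 2]]: one inner list per column (the RawData.data layout)

    # And back again, as the elif branch does:
    names = list(raw_data[0].keys())
    rows = [{n: columns[ci][ri] for ci, n in enumerate(names)}
            for ri in range(len(columns[0]))]
    assert rows == raw_data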

flowfile_core/main.py CHANGED
@@ -8,7 +8,8 @@ from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 
 from flowfile_core import ServerRun
-from flowfile_core.configs.settings import SERVER_HOST, SERVER_PORT, WORKER_HOST, WORKER_PORT, WORKER_URL
+from flowfile_core.configs.settings import (SERVER_HOST, SERVER_PORT, WORKER_HOST, WORKER_PORT, WORKER_URL,
+                                            OFFLOAD_TO_WORKER)
 
 from flowfile_core.routes.auth import router as auth_router
 from flowfile_core.routes.secrets import router as secrets_router
@@ -107,7 +108,6 @@ def run(host: str = None, port: int = None):
         host = SERVER_HOST
     if port is None:
         port = SERVER_PORT
-
     print(f"Starting server on {host}:{port}")
     print(f"Worker configured at {WORKER_URL} (host: {WORKER_HOST}, port: {WORKER_PORT})")
@@ -120,7 +120,6 @@
         host=host,
         port=port,
         loop="asyncio",
-        log_level="warning",
     )
     server = uvicorn.Server(config)
     server_instance = server  # Store server instance globally

flowfile_core/routes/secrets.py CHANGED
@@ -10,7 +10,7 @@ from flowfile_core.auth.jwt import get_current_active_user
 from flowfile_core.auth.models import Secret, SecretInput
 from flowfile_core.database import models as db_models
 from flowfile_core.database.connection import get_db
-from flowfile_core.secrets.secrets import encrypt_secret, store_secret, delete_secret as delete_secret_action
+from flowfile_core.secret_manager.secret_manager import encrypt_secret, store_secret, delete_secret as delete_secret_action
 
 router = APIRouter(dependencies=[Depends(get_current_active_user)])
 

flowfile_core/schemas/input_schema.py CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional, Literal
+from typing import List, Optional, Literal, Iterator
 from flowfile_core.schemas import transform_schema
 from pathlib import Path
 import os
@@ -61,7 +61,7 @@ class ReceivedTableBase(BaseModel):
         return self.path
 
     def set_absolute_filepath(self):
-        base_path = Path(self.path)
+        base_path = Path(self.path).expanduser()
         # Check if the path is relative, resolve it with the current working directory
         if not base_path.is_absolute():
             base_path = Path.cwd() / base_path
@@ -97,7 +97,7 @@ class ReceivedJsonTable(ReceivedCsvTable):
     pass
 
 
-class ReceivedParquetTable(BaseModel):
+class ReceivedParquetTable(ReceivedTableBase):
     file_type: str = 'parquet'
 
 
@@ -247,8 +247,14 @@ class NodeDatasource(NodeBase):
     file_ref: str = None
 
 
+class RawData(BaseModel):
+    columns: List[MinimalFieldInfo] = None
+    data: List[List]  # List of list where each inner list is a column of data. This ensures more efficient storage
+
+
 class NodeManualInput(NodeBase):
-    raw_data: List = None
+    raw_data: Optional[List] = None
+    raw_data_format: Optional[RawData] = None
 
 
 class NodeRead(NodeBase):
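
Review note: `RawData` stores values column-major: `data` holds one inner list per column, parallel to `columns`. A minimal construction sketch (the field names come from this diff; the example values and dtype strings are made up):

    raw = RawData(
        columns=[MinimalFieldInfo(name="name", data_type="String"),
                 MinimalFieldInfo(name="qty", data_type="Int64")],
        data=[["a", "b"], [1, 2]],  # column 0, then column 1, not rows
    )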

flowfile_core/schemas/transform_schema.py CHANGED
@@ -166,10 +166,8 @@ class FuzzyMap(JoinMap):
         self.output_column_name = f'fuzzy_score_{self.left_col}_{self.right_col}'
 
 
-@dataclass
-class CrossJoinInput:
-    left_select: SelectInputs = None
-    right_select: SelectInputs = None
+class JoinSelectMixin:
+    """Mixin for common join selection functionality"""
 
     @staticmethod
     def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> SelectInputs:
@@ -184,6 +182,26 @@ class JoinSelectMixin:
         elif all(isinstance(c, str) for c in select):
             return SelectInputs([SelectInput(s, s) for s in select])
 
+    def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
+        current_names = self.left_select.new_cols & self.right_select.new_cols
+        if old_col_name not in current_names:
+            return old_col_name
+        while True:
+            if old_col_name not in current_names:
+                return old_col_name
+            old_col_name = f'{side}_{old_col_name}'
+
+    def add_new_select_column(self, select_input: SelectInput, side: str):
+        selects = self.right_select if side == 'right' else self.left_select
+        select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
+        selects.__add__(select_input)
+
+
+@dataclass
+class CrossJoinInput(JoinSelectMixin):
+    left_select: SelectInputs = None
+    right_select: SelectInputs = None
+
     def __init__(self, left_select: List[SelectInput] | List[str],
                  right_select: List[SelectInput] | List[str]):
         self.left_select = self.parse_select(left_select)
@@ -201,23 +219,9 @@ class CrossJoinInput:
             right_col.new_name = 'right_' + right_col.new_name
         overlapping_records = self.overlapping_records
 
-    def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
-        current_names = self.left_select.new_cols & self.right_select.new_cols
-        if old_col_name not in current_names:
-            return old_col_name
-        while True:
-            if old_col_name not in current_names:
-                return old_col_name
-            old_col_name = f'{side}_{old_col_name}'
-
-    def add_new_select_column(self, select_input: SelectInput, side: str):
-        selects = self.right_select if side == 'right' else self.left_select
-        select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
-        selects.__add__(select_input)
-
 
 @dataclass
-class JoinInput:
+class JoinInput(JoinSelectMixin):
     join_mapping: List[JoinMap]
     left_select: SelectInputs = None
     right_select: SelectInputs = None
@@ -243,20 +247,8 @@ class JoinInput:
             raise Exception('No valid join mapping as input')
         return join_mapping
 
-    @staticmethod
-    def parse_select(select: List[SelectInput] | List[str] | List[Dict]) -> SelectInputs:
-        if all(isinstance(c, SelectInput) for c in select):
-            return SelectInputs(select)
-        elif all(isinstance(c, dict) for c in select):
-            return SelectInputs([SelectInput(**c) for c in select])
-        elif isinstance(select, dict):
-            renames = select.get('renames')
-            if renames:
-                return SelectInputs([SelectInput(**c) for c in renames])
-        elif all(isinstance(c, str) for c in select):
-            return SelectInputs([SelectInput(s, s) for s in select])
-
-    def __init__(self, join_mapping: List[JoinMap] | Tuple[str, str] | str, left_select: List[SelectInput] | List[str],
+    def __init__(self, join_mapping: List[JoinMap] | Tuple[str, str] | str,
+                 left_select: List[SelectInput] | List[str],
                  right_select: List[SelectInput] | List[str],
                  how: JoinStrategy = 'inner'):
         self.join_mapping = self.parse_join_mapping(join_mapping)
@@ -311,20 +303,6 @@ class JoinInput:
         )
         return new_mappings
 
-    def auto_generate_new_col_name(self, old_col_name: str, side: str) -> str:
-        current_names = self.left_select.new_cols & self.right_select.new_cols
-        if old_col_name not in current_names:
-            return old_col_name
-        while True:
-            if old_col_name not in current_names:
-                return old_col_name
-            old_col_name = f'{side}_{old_col_name}'
-
-    def add_new_select_column(self, select_input: SelectInput, side: str):
-        selects = self.right_select if side == 'right' else self.left_select
-        select_input.new_name = self.auto_generate_new_col_name(select_input.old_name, side=side)
-        selects.__add__(select_input)
-
 
 @dataclass
 class FuzzyMatchInput(JoinInput):
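
Review note: extracting `parse_select`, `auto_generate_new_col_name`, and `add_new_select_column` into `JoinSelectMixin` removes two verbatim copies from `CrossJoinInput` and `JoinInput`. The renaming loop keeps prefixing the side until the name is free among columns that appear on both sides; a standalone copy of that loop for illustration:

    def auto_generate_new_col_name(old_col_name: str, side: str, current_names: set) -> str:
        # Same loop as the mixin, with the name set passed in explicitly.
        while old_col_name in current_names:
            old_col_name = f"{side}_{old_col_name}"
        return old_col_name

    assert auto_generate_new_col_name("id", "right", {"id", "right_id"}) == "right_right_id"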

flowfile_frame/__init__.py CHANGED
@@ -1,6 +1,10 @@
 # flowframe/__init__.py
 """A Polars-like API for building ETL graphs."""
 
+from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
+
+OFFLOAD_TO_WORKER.value = False
+
 # Core classes
 from flowfile_frame.flow_frame import FlowFrame  # noqa: F401
 
@@ -10,9 +14,11 @@ from flowfile_frame.utils import create_flow_graph  # noqa: F401
 from flowfile_frame.expr import (  # noqa: F401
     col, lit, column,
     cum_count, len,
-    sum, min, max, mean, count, when
+    sum, min, max, mean, count, when, implode, last, corr, cov, first
 )
 
+from flowfile_frame.lazy import (fold)
+
 # Selector utilities
 from flowfile_frame.selectors import (  # noqa: F401
     numeric, float_, integer, string, temporal,
@@ -21,10 +27,11 @@ from flowfile_frame.selectors import (  # noqa: F401
     by_dtype, contains, starts_with, ends_with, matches
 )
 
+from flowfile_frame.series import Series
+
 # File I/O
-from flowfile_frame.flow_frame import (  # noqa: F401
-    read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet
-)
+from flowfile_frame.flow_frame_methods import (  # noqa: F401
+    read_csv, read_parquet, from_dict, concat, scan_csv, scan_parquet)
 
 from polars.datatypes import (  # noqa: F401
     # Integer types