flowfile-0.4.1-py3-none-any.whl → flowfile-0.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +3 -1
- flowfile/api.py +1 -2
- flowfile/web/static/assets/{CloudConnectionManager-d3248f8d.js → CloudConnectionManager-0dfba9f2.js} +2 -2
- flowfile/web/static/assets/{CloudStorageReader-d65bf041.js → CloudStorageReader-d5b1b6c9.js} +6 -6
- flowfile/web/static/assets/{CloudStorageWriter-e83be3ed.js → CloudStorageWriter-00d87aad.js} +6 -6
- flowfile/web/static/assets/{ColumnSelector-cce661cf.js → ColumnSelector-4685e75d.js} +1 -1
- flowfile/web/static/assets/{ContextMenu-cf18d2cc.js → ContextMenu-23e909da.js} +1 -1
- flowfile/web/static/assets/{ContextMenu-160afb08.js → ContextMenu-70ae0c79.js} +1 -1
- flowfile/web/static/assets/{ContextMenu-11a4652a.js → ContextMenu-f149cf7c.js} +1 -1
- flowfile/web/static/assets/{CrossJoin-d395d38c.js → CrossJoin-702a3edd.js} +7 -7
- flowfile/web/static/assets/{CustomNode-b812dc0b.js → CustomNode-b1519993.js} +11 -11
- flowfile/web/static/assets/{DatabaseConnectionSettings-7000bf2c.js → DatabaseConnectionSettings-6f3e4ea5.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-9662ec5b.js → DatabaseManager-cf5ef661.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-4f035d0c.js → DatabaseReader-d38c7295.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-f65dcd54.js → DatabaseWriter-b04ef46a.js} +8 -8
- flowfile/web/static/assets/{ExploreData-94c43dfc.js → ExploreData-5fa10ed8.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-ac04b3cc.js → ExternalSource-d39af878.js} +5 -5
- flowfile/web/static/assets/{Filter-812dcbca.js → Filter-9b6d08db.js} +7 -7
- flowfile/web/static/assets/{Formula-71472193.js → Formula-6b04fb1d.js} +7 -7
- flowfile/web/static/assets/{FuzzyMatch-b317f631.js → FuzzyMatch-999521f4.js} +8 -8
- flowfile/web/static/assets/{GraphSolver-754a234f.js → GraphSolver-17dd2198.js} +6 -6
- flowfile/web/static/assets/{GroupBy-6c6f9802.js → GroupBy-6b039e18.js} +5 -5
- flowfile/web/static/assets/{Join-a1b800be.js → Join-24d0f113.js} +8 -8
- flowfile/web/static/assets/{ManualInput-a9640276.js → ManualInput-34639209.js} +4 -4
- flowfile/web/static/assets/{MultiSelect-97213888.js → MultiSelect-0e8724a3.js} +2 -2
- flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-6ffe088a.js → MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js} +1 -1
- flowfile/web/static/assets/{NumericInput-e638088a.js → NumericInput-3d63a470.js} +2 -2
- flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-90eb2cba.js → NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js} +1 -1
- flowfile/web/static/assets/{Output-ddc9079f.css → Output-283fe388.css} +5 -5
- flowfile/web/static/assets/{Output-76750610.js → Output-edea9802.js} +57 -38
- flowfile/web/static/assets/{Pivot-7814803f.js → Pivot-61d19301.js} +7 -7
- flowfile/web/static/assets/{PivotValidation-f92137d2.js → PivotValidation-de9f43fe.js} +1 -1
- flowfile/web/static/assets/{PivotValidation-76dd431a.js → PivotValidation-f97fec5b.js} +1 -1
- flowfile/web/static/assets/{PolarsCode-889c3008.js → PolarsCode-bc3c9984.js} +5 -5
- flowfile/web/static/assets/{Read-637b72a7.js → Read-64a3f259.js} +80 -105
- flowfile/web/static/assets/{Read-6b17491f.css → Read-e808b239.css} +10 -10
- flowfile/web/static/assets/{RecordCount-2b050c41.js → RecordCount-3d5039be.js} +4 -4
- flowfile/web/static/assets/{RecordId-81df7784.js → RecordId-597510e0.js} +6 -6
- flowfile/web/static/assets/{SQLQueryComponent-88dcfe53.js → SQLQueryComponent-df51adbe.js} +1 -1
- flowfile/web/static/assets/{Sample-258ad2a9.js → Sample-4be0a507.js} +4 -4
- flowfile/web/static/assets/{SecretManager-2a2cb7e2.js → SecretManager-4839be57.js} +2 -2
- flowfile/web/static/assets/{Select-850215fd.js → Select-9b72f201.js} +7 -7
- flowfile/web/static/assets/{SettingsSection-29b4fa6b.js → SettingsSection-7ded385d.js} +1 -1
- flowfile/web/static/assets/{SettingsSection-0e8d9123.js → SettingsSection-e1e9c953.js} +1 -1
- flowfile/web/static/assets/{SettingsSection-55bae608.js → SettingsSection-f0f75a42.js} +1 -1
- flowfile/web/static/assets/{SingleSelect-bebd408b.js → SingleSelect-6c777aac.js} +2 -2
- flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-6093741c.js → SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js} +1 -1
- flowfile/web/static/assets/{SliderInput-6a05ab61.js → SliderInput-7cb93e62.js} +1 -1
- flowfile/web/static/assets/{Sort-10ab48ed.js → Sort-6cbde21a.js} +5 -5
- flowfile/web/static/assets/{TextInput-df9d6259.js → TextInput-d9a40c11.js} +2 -2
- flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-000e1178.js → TextInput.vue_vue_type_script_setup_true_lang-5896c375.js} +1 -1
- flowfile/web/static/assets/{TextToRows-6c2d93d8.js → TextToRows-c4fcbf4d.js} +7 -7
- flowfile/web/static/assets/{ToggleSwitch-0ff7ac52.js → ToggleSwitch-4ef91d19.js} +2 -2
- flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-c6dc3029.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js} +1 -1
- flowfile/web/static/assets/{UnavailableFields-1bab97cb.js → UnavailableFields-a03f512c.js} +2 -2
- flowfile/web/static/assets/{Union-b563478a.js → Union-bfe9b996.js} +4 -4
- flowfile/web/static/assets/{Unique-f90db5db.js → Unique-5d023a27.js} +8 -20
- flowfile/web/static/assets/{Unpivot-bcb0025f.js → Unpivot-91cc5354.js} +6 -6
- flowfile/web/static/assets/{UnpivotValidation-c4e73b04.js → UnpivotValidation-7ee2de44.js} +1 -1
- flowfile/web/static/assets/{VueGraphicWalker-bb8535e2.js → VueGraphicWalker-e51b9924.js} +1 -1
- flowfile/web/static/assets/{api-2d6adc4f.js → api-c1bad5ca.js} +1 -1
- flowfile/web/static/assets/{api-4c8e3822.js → api-cf1221f0.js} +1 -1
- flowfile/web/static/assets/{designer-e3c150ec.css → designer-8da3ba3a.css} +90 -67
- flowfile/web/static/assets/{designer-f3656d8c.js → designer-9633482a.js} +119 -51
- flowfile/web/static/assets/{documentation-52b241e7.js → documentation-ca400224.js} +1 -1
- flowfile/web/static/assets/{dropDown-1bca8a74.js → dropDown-614b998d.js} +1 -1
- flowfile/web/static/assets/{fullEditor-2985687e.js → fullEditor-f7971590.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-0476ba4e.js → genericNodeSettings-4fe5f36b.js} +3 -3
- flowfile/web/static/assets/{index-246f201c.js → index-5429bbf8.js} +6 -8
- flowfile/web/static/assets/nodeInput-5d0d6b79.js +41 -0
- flowfile/web/static/assets/{outputCsv-d686eeaf.js → outputCsv-076b85ab.js} +1 -1
- flowfile/web/static/assets/{outputExcel-8809ea2f.js → outputExcel-0fd17dbe.js} +1 -1
- flowfile/web/static/assets/{outputParquet-53ba645a.js → outputParquet-b61e0847.js} +1 -1
- flowfile/web/static/assets/{readCsv-053bf97b.js → readCsv-a8bb8b61.js} +21 -20
- flowfile/web/static/assets/{readCsv-bca3ed53.css → readCsv-c767cb37.css} +13 -13
- flowfile/web/static/assets/{readExcel-ad531eab.js → readExcel-67b4aee0.js} +10 -12
- flowfile/web/static/assets/{readExcel-e1b381ea.css → readExcel-806d2826.css} +12 -12
- flowfile/web/static/assets/{readParquet-cee068e2.css → readParquet-48c81530.css} +3 -3
- flowfile/web/static/assets/{readParquet-58e899a1.js → readParquet-92ce1dbc.js} +4 -7
- flowfile/web/static/assets/{secretApi-538058f3.js → secretApi-68435402.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-b38de2ba.js → selectDynamic-92e25ee3.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-db9b8936.js → vue-codemirror.esm-41b0e0d7.js} +7 -4
- flowfile/web/static/assets/{vue-content-loader.es-b5f3ac30.js → vue-content-loader.es-2c8e608f.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/METADATA +3 -2
- {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/RECORD +138 -126
- {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/WHEEL +1 -1
- {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/entry_points.txt +1 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +1 -0
- flowfile_core/flowfile/code_generator/code_generator.py +62 -64
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +73 -56
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +77 -86
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +23 -23
- flowfile_core/flowfile/flow_data_engine/join/utils.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +9 -4
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +184 -78
- flowfile_core/flowfile/flow_data_engine/utils.py +2 -0
- flowfile_core/flowfile/flow_graph.py +129 -26
- flowfile_core/flowfile/flow_node/flow_node.py +3 -0
- flowfile_core/flowfile/flow_node/models.py +2 -1
- flowfile_core/flowfile/handler.py +5 -5
- flowfile_core/flowfile/manage/compatibility_enhancements.py +404 -41
- flowfile_core/flowfile/manage/io_flowfile.py +394 -0
- flowfile_core/flowfile/node_designer/__init__.py +1 -1
- flowfile_core/flowfile/node_designer/_type_registry.py +2 -2
- flowfile_core/flowfile/node_designer/custom_node.py +1 -1
- flowfile_core/flowfile/node_designer/ui_components.py +1 -1
- flowfile_core/flowfile/schema_callbacks.py +8 -5
- flowfile_core/flowfile/setting_generator/settings.py +15 -9
- flowfile_core/routes/routes.py +8 -10
- flowfile_core/schemas/cloud_storage_schemas.py +0 -2
- flowfile_core/schemas/input_schema.py +222 -65
- flowfile_core/schemas/output_model.py +1 -1
- flowfile_core/schemas/schemas.py +145 -32
- flowfile_core/schemas/transform_schema.py +1083 -413
- flowfile_core/schemas/yaml_types.py +103 -0
- flowfile_core/{flowfile/node_designer/data_types.py → types.py} +11 -1
- flowfile_frame/__init__.py +3 -1
- flowfile_frame/flow_frame.py +15 -18
- flowfile_frame/flow_frame_methods.py +12 -9
- flowfile_worker/__init__.py +3 -0
- flowfile_worker/create/__init__.py +3 -21
- flowfile_worker/create/funcs.py +68 -56
- flowfile_worker/create/models.py +130 -62
- flowfile_worker/routes.py +5 -8
- tools/migrate/README.md +56 -0
- tools/migrate/__init__.py +12 -0
- tools/migrate/__main__.py +131 -0
- tools/migrate/legacy_schemas.py +621 -0
- tools/migrate/migrate.py +598 -0
- tools/migrate/tests/__init__.py +0 -0
- tools/migrate/tests/conftest.py +23 -0
- tools/migrate/tests/test_migrate.py +627 -0
- tools/migrate/tests/test_migration_e2e.py +1010 -0
- tools/migrate/tests/test_node_migrations.py +813 -0
- flowfile_core/flowfile/manage/open_flowfile.py +0 -143
- {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/licenses/LICENSE +0 -0
- /flowfile_core/flowfile/manage/manage_flowfile.py → /tools/__init__.py +0 -0
flowfile_core/schemas/yaml_types.py ADDED
@@ -0,0 +1,103 @@
+from typing import TypedDict, List
+
+
+# === Transform Schema YAML Types ===
+
+class SelectInputYaml(TypedDict, total=False):
+    old_name: str
+    new_name: str
+    keep: bool
+    data_type: str
+
+
+class JoinInputsYaml(TypedDict):
+    select: List[SelectInputYaml]
+
+
+class JoinMapYaml(TypedDict):
+    left_col: str
+    right_col: str
+
+
+class JoinInputYaml(TypedDict):
+    join_mapping: List[JoinMapYaml]
+    left_select: JoinInputsYaml
+    right_select: JoinInputsYaml
+    how: str
+
+
+class CrossJoinInputYaml(TypedDict):
+    left_select: JoinInputsYaml
+    right_select: JoinInputsYaml
+
+
+class FuzzyMappingYaml(TypedDict, total=False):
+    left_col: str
+    right_col: str
+    threshold_score: float
+    fuzzy_type: str
+    perc_unique: float
+    output_column_name: str
+    valid: bool
+
+
+class FuzzyMatchInputYaml(TypedDict):
+    join_mapping: List[FuzzyMappingYaml]
+    left_select: JoinInputsYaml
+    right_select: JoinInputsYaml
+    how: str
+    aggregate_output: bool
+
+
+# === Input Schema YAML Types ===
+
+class OutputSettingsYaml(TypedDict, total=False):
+    name: str
+    directory: str
+    file_type: str
+    write_mode: str
+    abs_file_path: str
+    fields: List[str]
+    table_settings: dict
+
+
+class NodeSelectYaml(TypedDict):
+    cache_results: bool
+    keep_missing: bool
+    select_input: List[SelectInputYaml]
+    sorted_by: str
+
+
+class NodeJoinYaml(TypedDict):
+    cache_results: bool
+    auto_generate_selection: bool
+    verify_integrity: bool
+    join_input: JoinInputYaml
+    auto_keep_all: bool
+    auto_keep_right: bool
+    auto_keep_left: bool
+
+
+class NodeCrossJoinYaml(TypedDict):
+    cache_results: bool
+    auto_generate_selection: bool
+    verify_integrity: bool
+    cross_join_input: CrossJoinInputYaml
+    auto_keep_all: bool
+    auto_keep_right: bool
+    auto_keep_left: bool
+
+
+class NodeFuzzyMatchYaml(TypedDict):
+    cache_results: bool
+    auto_generate_selection: bool
+    verify_integrity: bool
+    join_input: FuzzyMatchInputYaml
+    auto_keep_all: bool
+    auto_keep_right: bool
+    auto_keep_left: bool
+
+
+class NodeOutputYaml(TypedDict):
+    cache_results: bool
+    output_settings: OutputSettingsYaml
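
Because these are plain TypedDicts, a parsed flow-file YAML payload can be annotated and checked with no extra machinery. A minimal sketch, assuming only the module path from the file list above; the field values are illustrative:

    from typing import List

    from flowfile_core.schemas.yaml_types import NodeSelectYaml, SelectInputYaml

    # total=False on SelectInputYaml means keys may be omitted per entry.
    select_input: List[SelectInputYaml] = [
        {"old_name": "id", "new_name": "user_id", "keep": True, "data_type": "Int64"},
        {"old_name": "debug_col", "keep": False},
    ]

    node: NodeSelectYaml = {
        "cache_results": False,
        "keep_missing": True,
        "select_input": select_input,
        "sorted_by": "",  # illustrative value; real flow files set this field
    }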
flowfile_core/{flowfile/node_designer/data_types.py → types.py} CHANGED
@@ -18,10 +18,20 @@ Usage:
 """
 
 from enum import Enum
-from typing import List, Union
+from typing import List, Literal, Union
 import polars as pl
 
 
+DataTypeStr = Literal[
+    "Int8", "Int16", "Int32", "Int64",
+    "UInt8", "UInt16", "UInt32", "UInt64",
+    "Float32", "Float64", "Decimal",
+    "String",
+    "Date", "Datetime", "Time", "Duration",
+    "Boolean", "Binary", "List", "Struct", "Array", "Integer", "Double", "Utf8"
+]
+
+
 class TypeGroup(str, Enum):
     """High-level type groups for column selection."""
     Numeric = "Numeric"
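
The new DataTypeStr literal gives static checkers a closed set of data-type names, including legacy aliases such as "Utf8", "Integer", and "Double". A hedged sketch of the kind of lookup it enables; the mapping below is illustrative, not the package's own resolver:

    import polars as pl
    from polars.datatypes import DataTypeClass

    from flowfile_core.types import DataTypeStr  # module path after the rename

    # Illustrative subset: resolve a few literal names to Polars dtype classes.
    _EXAMPLE_DTYPES: dict[str, DataTypeClass] = {
        "Int64": pl.Int64,
        "Float64": pl.Float64,
        "String": pl.String,
        "Boolean": pl.Boolean,
    }

    def to_polars_dtype(name: DataTypeStr) -> DataTypeClass:
        """Resolve a DataTypeStr name to a Polars dtype class (sketch)."""
        return _EXAMPLE_DTYPES[name]

    to_polars_dtype("Int64")     # fine
    # to_polars_dtype("int64")   # a type checker rejects this: not in the Literal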
flowfile_frame/__init__.py CHANGED
@@ -1,6 +1,8 @@
 # flowframe/__init__.py
 """A Polars-like API for building ETL graphs."""
 
+from importlib.metadata import version
+
 # Core classes
 from flowfile_frame.flow_frame import FlowFrame  # noqa: F401
 from pl_fuzzy_frame_match.models import FuzzyMapping  # noqa: F401
@@ -64,4 +66,4 @@ from polars.datatypes import (  # noqa: F401
     DataType, DataTypeClass, Field
 )
 
-__version__ = "
+__version__ = version("Flowfile")
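
The __version__ change replaces a hard-coded string (the old value is elided in the diff viewer output above) with a lookup of the installed distribution's metadata, so the version lives in one place: the wheel's METADATA. Roughly, with a fallback branch added here for illustration only:

    from importlib.metadata import PackageNotFoundError, version

    try:
        __version__ = version("Flowfile")  # reads dist-info metadata at import time
    except PackageNotFoundError:
        # Not part of the diff: a common guard for running from an uninstalled checkout.
        __version__ = "0.0.0.dev0"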
flowfile_frame/flow_frame.py CHANGED
@@ -10,7 +10,7 @@ from flowfile_frame.lazy_methods import add_lazyframe_methods
 from polars._typing import (CsvEncoding, FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
 from collections.abc import Iterator
 
-from pl_fuzzy_frame_match import FuzzyMapping
+from pl_fuzzy_frame_match import FuzzyMapping
 
 from flowfile_core.flowfile.flow_graph import FlowGraph, add_connection
 from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
@@ -626,7 +626,6 @@
         left_columns, right_columns = self._parse_join_columns(
             on, left_on, right_on, how
         )
-
         # Step 5: Validate column lists have same length (except for cross join)
         if how != 'cross' and left_columns is not None and right_columns is not None:
             if len(left_columns) != len(right_columns):
@@ -798,33 +797,36 @@
     ) -> "FlowFrame":
         """Execute join using native FlowFile join nodes."""
         # Create select inputs for both frames
+
         left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
         right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)
         # Create appropriate join input based on join type
         if how == 'cross':
             join_input = transform_schema.CrossJoinInput(
-                left_select=left_select.renames,
+                left_select=transform_schema.JoinInputs(renames=left_select.renames),
                 right_select=right_select.renames,
             )
+            join_input_manager = transform_schema.CrossJoinInputManager(join_input)
+
         else:
             join_input = transform_schema.JoinInput(
                 join_mapping=join_mappings,
-                left_select=left_select.renames,
+                left_select=transform_schema.JoinInputs(renames=left_select.renames),
                 right_select=right_select.renames,
                 how=how,
            )
+            join_input_manager = transform_schema.JoinInputManager(join_input)
 
         # Configure join input
-
-        for right_column in right_select.renames:
+        for right_column in join_input_manager.right_select.renames:
             if right_column.join_key:
                 right_column.keep = False
 
         # Create and add appropriate node
         if how == 'cross':
-            self._add_cross_join_node(new_node_id,
+            self._add_cross_join_node(new_node_id, join_input_manager.to_cross_join_input(), description, other)
         else:
-            self._add_regular_join_node(new_node_id,
+            self._add_regular_join_node(new_node_id, join_input_manager.to_join_input(), description, other)
 
         # Add connections
         self._add_connection(self.node_id, new_node_id, "main")
@@ -1140,16 +1142,11 @@
         file_name = file_str.split(os.sep)[-1]
         use_polars_code = bool(kwargs.items()) or not is_path_input
 
-        output_parquet_table = input_schema.OutputParquetTable(
-            file_type="parquet"
-        )
         output_settings = input_schema.OutputSettings(
             file_type='parquet',
             name=file_name,
             directory=file_str if is_path_input else str(file_str),
-
-            output_csv_table=input_schema.OutputCsvTable(),
-            output_excel_table=input_schema.OutputExcelTable()
+            table_settings=input_schema.OutputParquetTable()
        )
 
        if is_path_input:
@@ -1220,10 +1217,10 @@
             file_type='csv',
             name=file_name,
             directory=file_str if is_path_input else str(file_str),
-
-
-
-
+            table_settings=input_schema.OutputCsvTable(
+                delimiter=separator,
+                encoding=encoding
+            )
         )
         if is_path_input:
             try:
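
The write-path hunks show OutputSettings dropping the parallel output_csv_table/output_excel_table/output_parquet_table fields in favor of a single table_settings object matching file_type. A sketch of the new shape, assuming the import path from the file list and that the constructors accept exactly the keyword arguments visible above:

    from flowfile_core.schemas import input_schema

    # Parquet: the format object carries no extra options here.
    parquet_settings = input_schema.OutputSettings(
        file_type="parquet",
        name="result.parquet",
        directory="/tmp/out",
        table_settings=input_schema.OutputParquetTable(),
    )

    # CSV: delimiter and encoding now travel inside table_settings.
    csv_settings = input_schema.OutputSettings(
        file_type="csv",
        name="result.csv",
        directory="/tmp/out",
        table_settings=input_schema.OutputCsvTable(delimiter=";", encoding="utf8"),
    )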
flowfile_frame/flow_frame_methods.py CHANGED
@@ -186,15 +186,17 @@ def read_csv(
         file_type='csv',
         path=current_source_path_for_native,
         name=Path(current_source_path_for_native).name,
-
-
-
-
-
-
-
-
-
+        table_settings=input_schema.InputCsvTable(
+            delimiter=separator,
+            has_headers=has_header,
+            encoding=encoding,
+            starting_from_line=skip_rows,
+            quote_char=quote_char if quote_char is not None else '"',
+            infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
+            truncate_ragged_lines=truncate_ragged_lines,
+            ignore_errors=ignore_errors,
+            row_delimiter=eol_char
+        )
     )
     if convert_to_absolute_path:
         try:
@@ -407,6 +409,7 @@ def read_parquet(source, *, flow_graph: FlowGraph = None, description: str = Non
         file_type='parquet',
         path=source,
         name=Path(source).name,
+        table_settings=input_schema.InputParquetTable()
     )
     if convert_to_absolute_path:
         received_table.path = received_table.abs_file_path
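
Read settings move the same way: read_csv now forwards its parser options into input_schema.InputCsvTable rather than flat fields. A sketch of the settings object on its own, with defaults mirroring the fallbacks in the hunk above:

    from flowfile_core.schemas import input_schema

    csv_settings = input_schema.InputCsvTable(
        delimiter=",",
        has_headers=True,
        encoding="utf8",
        starting_from_line=0,        # skip_rows in the read_csv signature
        quote_char='"',              # fallback used when quote_char is None
        infer_schema_length=10000,   # fallback used when not supplied
        truncate_ragged_lines=False,
        ignore_errors=False,
        row_delimiter="\n",          # eol_char in the read_csv signature
    )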
flowfile_worker/__init__.py CHANGED
@@ -1,29 +1,11 @@
-
-                          ReceivedJsonTable)
+
 from flowfile_worker.create.funcs import (create_from_path_csv, create_from_path_parquet, create_from_path_excel,
                                           create_from_path_json)
-from typing import
+from typing import Literal
 
-ReceivedTableCollection = ReceivedCsvTable | ReceivedParquetTable | ReceivedJsonTable | ReceivedExcelTable
 FileType = Literal['csv', 'parquet', 'json', 'excel']
 
-
-def received_table_parser(received_table_raw: Dict, file_type: FileType) -> ReceivedTableCollection:
-    match file_type:
-        case 'csv':
-            received_table = ReceivedCsvTable.model_validate(received_table_raw)
-        case 'parquet':
-            received_table = ReceivedParquetTable.model_validate(received_table_raw)
-        case 'excel':
-            received_table = ReceivedExcelTable.model_validate(received_table_raw)
-        case 'json':
-            return ReceivedJsonTable.model_validate(received_table_raw)
-        case _:
-            raise ValueError(f'Unsupported file type: {file_type}')
-    return received_table
-
-
-def table_creator_factory_method(file_type: Literal['csv', 'parquet', 'json', 'excel']) -> callable:
+def table_creator_factory_method(file_type: FileType) -> callable:
     match file_type:
         case 'csv':
             return create_from_path_csv
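
With the Pydantic parsing helper gone from flowfile_worker/__init__.py, the module now only maps a file type onto its loader. Usage stays a one-liner; the received_table argument is the ReceivedTable model shown in the next file:

    from flowfile_worker import table_creator_factory_method

    loader = table_creator_factory_method("csv")  # -> create_from_path_csv
    # df = loader(received_table)                 # returns a Polars (Lazy)Frame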
flowfile_worker/create/funcs.py CHANGED
@@ -1,94 +1,100 @@
 import polars as pl
 import os
 
-from flowfile_worker.create.models import
+from flowfile_worker.create.models import ReceivedTable, InputCsvTable, InputJsonTable, InputExcelTable, InputParquetTable
 from flowfile_worker.create.utils import create_fake_data
 from flowfile_worker.create.read_excel_tables import df_from_openpyxl, df_from_calamine_xlsx
 
 
-def create_from_path_json(received_table:
+def create_from_path_json(received_table: ReceivedTable):
+    if not isinstance(received_table.table_settings, InputJsonTable):
+        raise ValueError("Received table settings are not of type InputJsonTable")
+    input_table_settings: InputJsonTable = received_table.table_settings
     f = received_table.abs_file_path
     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
     low_mem = gbs_to_load > 10
-    if
+    if input_table_settings.encoding.upper() == 'UTF8' or input_table_settings.encoding.upper() == 'UTF-8':
         try:
             df = pl.scan_csv(f,
                              low_memory=low_mem,
                              try_parse_dates=True,
-                             separator=
-                             has_header=
-                             skip_rows=
+                             separator=input_table_settings.delimiter,
+                             has_header=input_table_settings.has_headers,
+                             skip_rows=input_table_settings.starting_from_line,
                              encoding='utf8',
-                             infer_schema_length=
+                             infer_schema_length=input_table_settings.infer_schema_length)
             df.head(1).collect()
             return df
         except:
             try:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=
-                                 has_header=
-                                 skip_rows=
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8-lossy',
                                  ignore_errors=True)
                 return df
            except:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=
-                                 has_header=
-                                 skip_rows=
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8',
                                  ignore_errors=True)
                 return df
     else:
         df = pl.read_csv(f, low_memory=low_mem,
-                         separator=
-                         has_header=
-                         skip_rows=
-                         encoding=
+                         separator=input_table_settings.delimiter,
+                         has_header=input_table_settings.has_headers,
+                         skip_rows=input_table_settings.starting_from_line,
+                         encoding=input_table_settings.encoding,
                          ignore_errors=True)
         return df
 
 
-def create_from_path_csv(received_table:
+def create_from_path_csv(received_table: ReceivedTable) -> pl.DataFrame:
     f = received_table.abs_file_path
+    if not isinstance(received_table.table_settings, InputCsvTable):
+        raise ValueError("Received table settings are not of type InputCsvTable")
+    input_table_settings: InputCsvTable = received_table.table_settings
     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
     low_mem = gbs_to_load > 10
-    if
+    if input_table_settings.encoding.upper() == 'UTF8' or input_table_settings.encoding.upper() == 'UTF-8':
         try:
             df = pl.scan_csv(f,
                              low_memory=low_mem,
                              try_parse_dates=True,
-                             separator=
-                             has_header=
-                             skip_rows=
+                             separator=input_table_settings.delimiter,
+                             has_header=input_table_settings.has_headers,
+                             skip_rows=input_table_settings.starting_from_line,
                              encoding='utf8',
-                             infer_schema_length=
+                             infer_schema_length=input_table_settings.infer_schema_length)
             df.head(1).collect()
             return df
         except:
             try:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=
-                                 has_header=
-                                 skip_rows=
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8-lossy',
                                  ignore_errors=True)
                 return df
            except:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=
-                                 has_header=
-                                 skip_rows=
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8',
                                  ignore_errors=True)
                 return df
     else:
         df = pl.read_csv(f,
                          low_memory=low_mem,
-                         separator=
-                         has_header=
-                         skip_rows=
-                         encoding=
+                         separator=input_table_settings.delimiter,
+                         has_header=input_table_settings.has_headers,
+                         skip_rows=input_table_settings.starting_from_line,
+                         encoding=input_table_settings.encoding,
                          ignore_errors=True)
         return df
 
@@ -97,50 +103,56 @@ def create_random(number_of_records: int = 1000) -> pl.LazyFrame:
     return create_fake_data(number_of_records).lazy()
 
 
-def create_from_path_parquet(received_table:
+def create_from_path_parquet(received_table: ReceivedTable):
+    if not isinstance(received_table.table_settings, InputParquetTable):
+        raise ValueError("Received table settings are not of type InputParquetTable")
     low_mem = (os.path.getsize(received_table.abs_file_path) / 1024 / 1000 / 1000) > 2
     return pl.scan_parquet(source=received_table.abs_file_path, low_memory=low_mem)
 
 
-def create_from_path_excel(received_table:
-    if received_table.
+def create_from_path_excel(received_table: ReceivedTable):
+    if not isinstance(received_table.table_settings, InputExcelTable):
+        raise ValueError("Received table settings are not of type InputExcelTable")
+    input_table_settings: InputExcelTable = received_table.table_settings
+
+    if input_table_settings.type_inference:
         engine = 'openpyxl'
-    elif
-        engine = 'calamine' if
-    elif
+    elif input_table_settings.start_row > 0 and input_table_settings.start_column == 0:
+        engine = 'calamine' if input_table_settings.has_headers else 'xlsx2csv'
+    elif input_table_settings.start_column > 0 or input_table_settings.start_row > 0:
         engine = 'openpyxl'
     else:
         engine = 'calamine'
 
-    sheet_name =
+    sheet_name = input_table_settings.sheet_name
 
     if engine == 'calamine':
         df = df_from_calamine_xlsx(file_path=received_table.abs_file_path, sheet_name=sheet_name,
-                                   start_row=
-        if
-            end_col_index =
-            cols_to_select = [df.columns[i] for i in range(
+                                   start_row=input_table_settings.start_row, end_row=input_table_settings.end_row)
+        if input_table_settings.end_column > 0:
+            end_col_index = input_table_settings.end_column
+            cols_to_select = [df.columns[i] for i in range(input_table_settings.start_column, end_col_index)]
            df = df.select(cols_to_select)
 
    elif engine == 'xlsx2csv':
-        csv_options = {'has_header':
+        csv_options = {'has_header': input_table_settings.has_headers, 'skip_rows': input_table_settings.start_row}
        df = pl.read_excel(source=received_table.abs_file_path,
                           read_options=csv_options,
                           engine='xlsx2csv',
-                          sheet_name=
-        end_col_index =
-        cols_to_select = [df.columns[i] for i in range(
+                          sheet_name=input_table_settings.sheet_name)
+        end_col_index = input_table_settings.end_column if input_table_settings.end_column > 0 else len(df.columns)
+        cols_to_select = [df.columns[i] for i in range(input_table_settings.start_column, end_col_index)]
        df = df.select(cols_to_select)
-        if 0 <
-            df = df.head(
+        if 0 < input_table_settings.end_row < len(df):
+            df = df.head(input_table_settings.end_row)
 
    else:
-        max_col =
-        max_row =
+        max_col = input_table_settings.end_column if input_table_settings.end_column > 0 else None
+        max_row = input_table_settings.end_row + 1 if input_table_settings.end_row > 0 else None
        df = df_from_openpyxl(file_path=received_table.abs_file_path,
-                              sheet_name=
-                              min_row=
-                              min_col=
+                              sheet_name=input_table_settings.sheet_name,
+                              min_row=input_table_settings.start_row + 1,
+                              min_col=input_table_settings.start_column + 1,
                               max_row=max_row,
-                              max_col=max_col, has_headers=
+                              max_col=max_col, has_headers=input_table_settings.has_headers)
    return df
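
Each loader now guards on the concrete Input*Table type inside received_table.table_settings before touching file options. A hedged end-to-end sketch; the ReceivedTable constructor fields other than table_settings are assumptions inferred from the attribute accesses above:

    import polars as pl

    from flowfile_worker.create.funcs import create_from_path_csv
    from flowfile_worker.create.models import InputCsvTable, ReceivedTable

    received = ReceivedTable(
        name="events.csv",            # assumed field
        path="data/events.csv",       # assumed field backing abs_file_path
        file_type="csv",              # assumed field
        table_settings=InputCsvTable(
            delimiter=",",
            has_headers=True,
            encoding="utf8",
            starting_from_line=0,
            infer_schema_length=10000,
        ),
    )

    frame = create_from_path_csv(received)  # LazyFrame on the utf8 path, DataFrame otherwise
    df = frame.collect() if isinstance(frame, pl.LazyFrame) else frame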