Flowfile 0.3.1.2__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/api.py +5 -3
- flowfile/web/static/assets/{AirbyteReader-cb0c1d4a.js → AirbyteReader-2b1cf2d8.js} +10 -9
- flowfile/web/static/assets/{CrossJoin-a514fa59.js → CrossJoin-cc3ab73c.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-f2cecf33.js → DatabaseConnectionSettings-307c4652.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-83ee3c98.js → DatabaseManager-69faa6e1.js} +10 -6
- flowfile/web/static/assets/{DatabaseReader-dc0c6881.js → DatabaseReader-e4134cd0.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-5afe9f8d.js → DatabaseWriter-d32d75b1.js} +9 -9
- flowfile/web/static/assets/{ExploreData-c7ee19cf.js → ExploreData-5eb48389.js} +18639 -18629
- flowfile/web/static/assets/{ExternalSource-17b23a01.js → ExternalSource-29489051.js} +8 -21
- flowfile/web/static/assets/{Filter-90856b4f.js → Filter-031332bb.js} +9 -9
- flowfile/web/static/assets/{Formula-38b71e9e.js → Formula-3b900540.js} +15 -15
- flowfile/web/static/assets/{Formula-d60a74f4.css → Formula-b8cefc31.css} +4 -4
- flowfile/web/static/assets/{FuzzyMatch-d0f1fe81.js → FuzzyMatch-dee31153.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-0c86bbc6.js → GraphSolver-ca74eb47.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f2772e9f.js → GroupBy-081b6591.js} +8 -7
- flowfile/web/static/assets/{Join-bc3e1cf7.js → Join-b467376f.js} +11 -10
- flowfile/web/static/assets/{ManualInput-03aa0245.js → ManualInput-ffffb80a.js} +11 -8
- flowfile/web/static/assets/{Output-5b35eee8.js → Output-9a87d4ba.js} +4 -4
- flowfile/web/static/assets/{Pivot-7164087c.js → Pivot-ee3e6093.js} +8 -7
- flowfile/web/static/assets/{PolarsCode-3abf6507.js → PolarsCode-03921254.js} +13 -11
- flowfile/web/static/assets/{PopOver-b37ff9be.js → PopOver-3bdf8951.js} +1 -1
- flowfile/web/static/assets/{Read-65966a3e.js → Read-67fee3a0.js} +6 -6
- flowfile/web/static/assets/{RecordCount-c66c6d6d.js → RecordCount-a2acd02d.js} +7 -6
- flowfile/web/static/assets/{RecordId-826dc095.js → RecordId-0c8bcd77.js} +10 -8
- flowfile/web/static/assets/{Sample-4ed555c8.js → Sample-60594a3a.js} +7 -6
- flowfile/web/static/assets/{SecretManager-eac1e97d.js → SecretManager-bbcec2ac.js} +2 -2
- flowfile/web/static/assets/{Select-085f05cc.js → Select-9540e6ca.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-1f5e79c1.js → SettingsSection-48f28104.js} +1 -1
- flowfile/web/static/assets/{Sort-3e6cb414.js → Sort-6dbe3633.js} +6 -6
- flowfile/web/static/assets/{TextToRows-606349bc.js → TextToRows-27aab4a8.js} +18 -13
- flowfile/web/static/assets/{UnavailableFields-b41976ed.js → UnavailableFields-8143044b.js} +2 -2
- flowfile/web/static/assets/{Union-fca91665.js → Union-52460248.js} +7 -6
- flowfile/web/static/assets/{Unique-a59f830e.js → Unique-f6962644.js} +8 -8
- flowfile/web/static/assets/{Unpivot-c3815565.js → Unpivot-1ff1e938.js} +5 -5
- flowfile/web/static/assets/{api-22b338bd.js → api-3b345d92.js} +1 -1
- flowfile/web/static/assets/{designer-e5bbe26f.js → designer-4736134f.js} +72 -42
- flowfile/web/static/assets/{documentation-08045cf2.js → documentation-b9545eba.js} +1 -1
- flowfile/web/static/assets/{dropDown-5e7e9a5a.js → dropDown-d5a4014c.js} +1 -1
- flowfile/web/static/assets/{dropDownGeneric-50a91b99.js → dropDownGeneric-1f4e32ec.js} +2 -2
- flowfile/web/static/assets/{fullEditor-705c6ccb.js → fullEditor-f4791c23.js} +3 -3
- flowfile/web/static/assets/{genericNodeSettings-65587f20.js → genericNodeSettings-1d456350.js} +3 -3
- flowfile/web/static/assets/{index-552863fd.js → index-f25c9283.js} +2608 -1570
- flowfile/web/static/assets/{nodeTitle-cf9bae3c.js → nodeTitle-cad6fd9d.js} +3 -3
- flowfile/web/static/assets/{secretApi-3ad510e1.js → secretApi-01f07e2c.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-bd644891.js → selectDynamic-f46a4e3f.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-dd17b478.js → vue-codemirror.esm-eb98fc8b.js} +15 -14
- flowfile/web/static/assets/{vue-content-loader.es-6b36f05e.js → vue-content-loader.es-860c0380.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/METADATA +1 -3
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/RECORD +62 -64
- flowfile_core/configs/node_store/nodes.py +2 -4
- flowfile_core/flowfile/FlowfileFlow.py +72 -12
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1 -1
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +32 -1
- flowfile_core/flowfile/flow_graph_utils.py +320 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +0 -1
- flowfile_core/schemas/input_schema.py +2 -10
- flowfile_frame/__init__.py +1 -1
- flowfile_frame/flow_frame.py +455 -51
- flowfile/web/static/assets/GoogleSheet-854294a4.js +0 -2616
- flowfile/web/static/assets/GoogleSheet-92084da7.css +0 -233
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +0 -74
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/LICENSE +0 -0
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/WHEEL +0 -0
- {flowfile-0.3.1.2.dist-info → flowfile-0.3.2.dist-info}/entry_points.txt +0 -0
flowfile_frame/flow_frame.py
CHANGED
```diff
@@ -1,14 +1,17 @@
-import
+import logging
 import os
-from typing import Any, Iterable, List, Literal, Optional, Tuple, Union
+from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, Callable
 from pathlib import Path
 
+import io
 import re
 import polars as pl
-from polars._typing import FrameInitTypes, SchemaDefinition, SchemaDict, Orientation
+from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation, IO, Mapping, PolarsDataType,
+                            Sequence, CsvEncoding)
 
 # Assume these imports are correct from your original context
 from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
+from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
 from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
 from flowfile_core.flowfile.flow_node.flow_node import FlowNode
 from flowfile_core.schemas import input_schema, transform_schema
```
```diff
@@ -22,6 +25,14 @@ from flowfile_frame.join import _normalize_columns_to_list, _create_join_mapping
 node_id_counter = 0
 
 
+logging.basicConfig(
+    level=logging.INFO,
+    format='[%(levelname)s] %(message)s'
+)
+
+# Create and export the logger
+logger = logging.getLogger('flow_frame')
+
 def _to_string_val(v) -> str:
     if isinstance(v, str):
         return f"'{v}'"
```
```diff
@@ -478,14 +489,25 @@ class FlowFrame:
         FlowFrame
             New FlowFrame with join operation applied.
         """
-        new_node_id = generate_node_id()
-        print('new node id', new_node_id)
         use_polars_code = not(maintain_order is None and
                               coalesce is None and
                               nulls_equal is False and
                               validate is None and
                               suffix == '_right')
         join_mappings = None
+        if self.flow_graph.flow_id != other.flow_graph.flow_id:
+            combined_graph, node_mappings = combine_flow_graphs_with_mapping(self.flow_graph, other.flow_graph)
+            new_self_node_id = node_mappings.get((self.flow_graph.flow_id, self.node_id), None)
+            new_other_node_id = node_mappings.get((other.flow_graph.flow_id, other.node_id), None)
+            if new_other_node_id is None or new_self_node_id is None:
+                raise ValueError("Cannot remap the nodes")
+            self.node_id = new_self_node_id
+            other.node_id = new_other_node_id
+            self.flow_graph = combined_graph
+            other.flow_graph = combined_graph
+            global node_id_counter
+            node_id_counter += len(combined_graph.nodes)
+        new_node_id = generate_node_id()
         if on is not None:
             left_columns = right_columns = _normalize_columns_to_list(on)
         elif left_on is not None and right_on is not None:
```
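The hunk above is the new cross-graph handling in `FlowFrame.join`: when the two frames live on different `FlowGraph`s, the graphs are merged via `combine_flow_graphs_with_mapping` (added in `flowfile_core/flowfile/flow_graph_utils.py`, +320 lines in this release) and both frames are remapped onto the combined graph before the join node is created. A minimal usage sketch follows; the `ff` import alias, the top-level `from_dict` export, and the default of one graph per frame are assumptions based on names visible in this diff, not confirmed API:

```python
import flowfile_frame as ff  # import alias assumed

# Two frames built independently, each on its own FlowGraph (assumed default).
left = ff.from_dict({"id": [1, 2], "name": ["a", "b"]})
right = ff.from_dict({"id": [1, 2], "score": [10, 20]})

# As of 0.3.2, join() detects the differing flow_ids, merges both graphs,
# remaps each frame's node_id, and adds the join node to the combined graph.
joined = left.join(right, on="id")
```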
```diff
@@ -597,10 +619,8 @@ class FlowFrame:
         if (len(columns) == 1 and isinstance(columns[0], Expr)
                 and str(columns[0]) == "pl.Expr(len()).alias('number_of_records')"):
             return self._add_number_of_records(new_node_id, description)
-
-        # Handle simple column names
         if all(isinstance(col_, (str, Column)) for col_ in columns):
-
+
             select_inputs = [
                 transform_schema.SelectInput(old_name=col_) if isinstance(col_, str) else col_.to_select_input()
                 for col_ in columns
```
```diff
@@ -946,7 +966,7 @@ class FlowFrame:
             input_schema.NodeFormula(flow_id=self.flow_graph.flow_id, node_id=new_node_id, depending_on_id=self.node_id,
                                      function=transform_schema.FunctionInput(
                                          function=flowfile_formula,
-                                         field=transform_schema.FieldInput(name=output_column_name)),
+                                         field=transform_schema.FieldInput(name=output_column_name, data_type='Auto')),
                                      description=description))
         self.flow_graph.add_formula(function_settings)
         return self._create_child_frame(new_node_id)
```
```diff
@@ -1241,14 +1261,24 @@ class FlowFrame:
         FlowFrame
             A new FlowFrame with the concatenated data
         """
-        new_node_id = generate_node_id()
-
         # Convert single FlowFrame to list
         if isinstance(other, FlowFrame):
            others = [other]
         else:
             others = other
-
+        all_graphs = []
+        all_graph_ids = []
+        for g in [self.flow_graph] + [f.flow_graph for f in others]:
+            if g.flow_id not in all_graph_ids:
+                all_graph_ids.append(g.flow_id)
+                all_graphs.append(g)
+        if len(all_graphs) > 1:
+            combined_graph, node_mappings = combine_flow_graphs_with_mapping(*all_graphs)
+            for f in [self] + other:
+                f.node_id = node_mappings.get((f.flow_graph.flow_id, f.node_id), None)
+            global node_id_counter
+            node_id_counter += len(combined_graph.nodes)
+        new_node_id = generate_node_id()
         use_native = how == "diagonal_relaxed" and parallel and not rechunk
 
         if use_native:
```
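`concat` gets the same treatment: distinct source graphs are deduplicated by `flow_id` and merged with `combine_flow_graphs_with_mapping` before the concat node is added. A sketch under the same assumptions as the join example above:

```python
# Three frames on three separate graphs (assumed one-graph-per-frame default).
frames = [ff.from_dict({"x": [i], "y": [i * 2]}) for i in range(3)]

# As of 0.3.2, concat() collects the distinct graphs, combines them,
# and remaps every frame's node_id into the combined graph.
combined = ff.concat(frames, how="diagonal_relaxed")
```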
```diff
@@ -1902,64 +1932,328 @@ def count(expr):
     return expr.count()
 
 
-def read_csv(
-
-
+def read_csv(
+        source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
+        *,
+        flow_graph: Optional[Any] = None,  # Using Any for FlowGraph placeholder
+        separator: str = ',',
+        convert_to_absolute_path: bool = True,
+        description: Optional[str] = None,
+        has_header: bool = True,
+        new_columns: Optional[List[str]] = None,
+        comment_prefix: Optional[str] = None,
+        quote_char: Optional[str] = '"',
+        skip_rows: int = 0,
+        skip_lines: int = 0,
+        schema: Optional[SchemaDict] = None,
+        schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
+        null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
+        missing_utf8_is_empty_string: bool = False,
+        ignore_errors: bool = False,
+        try_parse_dates: bool = False,
+        infer_schema: bool = True,
+        infer_schema_length: Optional[int] = 100,
+        n_rows: Optional[int] = None,
+        encoding: CsvEncoding = 'utf8',
+        low_memory: bool = False,
+        rechunk: bool = False,
+        storage_options: Optional[Dict[str, Any]] = None,
+        skip_rows_after_header: int = 0,
+        row_index_name: Optional[str] = None,
+        row_index_offset: int = 0,
+        eol_char: str = '\n',
+        raise_if_empty: bool = True,
+        truncate_ragged_lines: bool = False,
+        decimal_comma: bool = False,
+        glob: bool = True,
+        cache: bool = True,
+        with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
+        **other_options: Any
+) -> FlowFrame:
     """
     Read a CSV file into a FlowFrame.
 
+    This function uses the native FlowGraph implementation when the parameters
+    fall within the supported range, and falls back to using Polars' scan_csv implementation
+    for more advanced features.
+
     Args:
-
+        source: Path(s) to CSV file(s), or a file-like object.
         flow_graph: if you want to add it to an existing graph
         separator: Single byte character to use as separator in the file.
         convert_to_absolute_path: If the path needs to be set to a fixed location
         description: if you want to add a readable name in the frontend (advised)
-
+
+        # Polars.scan_csv aligned parameters
+        has_header: Indicate if the first row of the dataset is a header or not.
+        new_columns: Rename columns after selection.
+        comment_prefix: String that indicates a comment line if found at beginning of line.
+        quote_char: Character used for quoting. None to disable.
+        skip_rows: Start reading after this many rows.
+        skip_lines: Skip this many lines by newline char only.
+        schema: Schema to use when reading the CSV.
+        schema_overrides: Schema overrides for specific columns.
+        null_values: Values to interpret as null.
+        missing_utf8_is_empty_string: Treat missing utf8 values as empty strings.
+        ignore_errors: Try to keep reading lines if some parsing errors occur.
+        try_parse_dates: Try to automatically parse dates.
+        infer_schema: Boolean flag. If False, `infer_schema_length` for Polars is set to 0.
+        infer_schema_length: Number of rows to use for schema inference. Polars default is 100.
+        n_rows: Stop reading after this many rows.
+        encoding: Character encoding to use.
+        low_memory: Reduce memory usage at the cost of performance.
+        rechunk: Ensure data is in contiguous memory layout after parsing.
+        storage_options: Options for fsspec for cloud storage.
+        skip_rows_after_header: Skip rows after header.
+        row_index_name: Name of the row index column.
+        row_index_offset: Start value for the row index.
+        eol_char: End of line character.
+        raise_if_empty: Raise error if file is empty.
+        truncate_ragged_lines: Truncate lines with too many values.
+        decimal_comma: Parse floats with decimal comma.
+        glob: Use glob pattern for file path (if source is a string).
+        cache: Cache the result after reading (Polars default True).
+        with_column_names: Apply a function over the column names.
+        other_options: Any other options to pass to polars.scan_csv (e.g. retries, file_cache_ttl).
 
     Returns:
-        A FlowFrame with the CSV data
+        A FlowFrame with the CSV data.
     """
-    #
-    node_id = generate_node_id()
+    node_id = generate_node_id()  # Assuming generate_node_id is defined
     if flow_graph is None:
-        flow_graph = create_flow_graph()
-
+        flow_graph = create_flow_graph()  # Assuming create_flow_graph is defined
     flow_id = flow_graph.flow_id
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    current_source_path_for_native = None
+    if isinstance(source, (str, os.PathLike)):
+        current_source_path_for_native = str(source)
+        if '~' in current_source_path_for_native:
+            current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
+    elif isinstance(source, list) and all(isinstance(s, (str, os.PathLike)) for s in source):
+        current_source_path_for_native = str(source[0]) if source else None
+        if current_source_path_for_native and '~' in current_source_path_for_native:
+            current_source_path_for_native = os.path.expanduser(current_source_path_for_native)
+    elif isinstance(source, (io.BytesIO, io.StringIO)):
+        logger.warning("Read from bytes io from csv not supported, converting data to raw data")
+        return from_dict(pl.read_csv(source), flow_graph=flow_graph, description=description)
+    actual_infer_schema_length: Optional[int]
+    if not infer_schema:
+        actual_infer_schema_length = 0
+    else:
+        actual_infer_schema_length = infer_schema_length
+    can_use_native = (
+        current_source_path_for_native is not None and
+        comment_prefix is None and
+        skip_lines == 0 and
+        schema is None and
+        schema_overrides is None and
+        null_values is None and
+        not missing_utf8_is_empty_string and
+        not try_parse_dates and
+        n_rows is None and
+        not low_memory and
+        not rechunk and
+        storage_options is None and
+        skip_rows_after_header == 0 and
+        row_index_name is None and
+        row_index_offset == 0 and
+        eol_char == '\n' and
+        not decimal_comma and
+        new_columns is None and
+        glob is True
     )
+    if can_use_native and current_source_path_for_native:
+        received_table = input_schema.ReceivedTable(
+            file_type='csv',
+            path=current_source_path_for_native,
+            name=Path(current_source_path_for_native).name,
+            delimiter=separator,
+            has_headers=has_header,
+            encoding=encoding,
+            starting_from_line=skip_rows,
+            quote_char=quote_char if quote_char is not None else '"',
+            infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
+            truncate_ragged_lines=truncate_ragged_lines,
+            ignore_errors=ignore_errors,
+            row_delimiter=eol_char
+        )
+        if convert_to_absolute_path:
+            try:
+                received_table.set_absolute_filepath()
+                received_table.path = received_table.abs_file_path
+            except Exception as e:
+                print(f"Warning: Could not determine absolute path for {current_source_path_for_native}: {e}")
 
-
-
+        read_node_description = description or f"Read CSV from {Path(current_source_path_for_native).name}"
+        read_node = input_schema.NodeRead(
+            flow_id=flow_id,
+            node_id=node_id,
+            received_file=received_table,
+            pos_x=100,
+            pos_y=100,
+            is_setup=True,
+            description=read_node_description
+        )
+        flow_graph.add_read(read_node)
+        result_frame = FlowFrame(
+            data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+            flow_graph=flow_graph,
+            node_id=node_id
+        )
+        return result_frame
+    else:
+        polars_source_arg = source
+        polars_code = _build_polars_code_args(
+            source=polars_source_arg,
+            separator=separator,
+            has_header=has_header,
+            new_columns=new_columns,
+            comment_prefix=comment_prefix,
+            quote_char=quote_char,
+            skip_rows=skip_rows,
+            skip_lines=skip_lines,
+            schema=schema,
+            schema_overrides=schema_overrides,
+            null_values=null_values,
+            missing_utf8_is_empty_string=missing_utf8_is_empty_string,
+            ignore_errors=ignore_errors,
+            try_parse_dates=try_parse_dates,
+            infer_schema_length=actual_infer_schema_length,
+            n_rows=n_rows,
+            encoding=encoding,
+            low_memory=low_memory,
+            rechunk=rechunk,
+            storage_options=storage_options,
+            skip_rows_after_header=skip_rows_after_header,
+            row_index_name=row_index_name,
+            row_index_offset=row_index_offset,
+            eol_char=eol_char,
+            raise_if_empty=raise_if_empty,
+            truncate_ragged_lines=truncate_ragged_lines,
+            decimal_comma=decimal_comma,
+            glob=glob,
+            cache=cache,
+            with_column_names=with_column_names,
+            **other_options
+        )
+        polars_code_node_description = description or "Read CSV with Polars scan_csv"
+        if isinstance(source, (str, os.PathLike)):
+            polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source).name}"
+        elif isinstance(source, list) and source and isinstance(source[0], (str, os.PathLike)):
+            polars_code_node_description = description or f"Read CSV with Polars scan_csv from {Path(source[0]).name} (and possibly others)"
 
-
-
-
-
-
-
-
-
+        # Assuming input_schema.NodePolarsCode, transform_schema.PolarsCodeInput are defined
+        polars_code_settings = input_schema.NodePolarsCode(
+            flow_id=flow_id,
+            node_id=node_id,
+            polars_code_input=transform_schema.PolarsCodeInput(polars_code=polars_code),
+            is_setup=True,
+            description=polars_code_node_description
+        )
+        flow_graph.add_polars_code(polars_code_settings)
+        return FlowFrame(
+            data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+            flow_graph=flow_graph,
+            node_id=node_id,
+        )
 
-
+def _build_polars_code_args(
+        source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
+        separator: str,
+        has_header: bool,
+        new_columns: Optional[List[str]],
+        comment_prefix: Optional[str],
+        quote_char: Optional[str],
+        skip_rows: int,
+        skip_lines: int,
+        schema: Optional[SchemaDict],
+        schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]],
+        null_values: Optional[Union[str, List[str], Dict[str, str]]],
+        missing_utf8_is_empty_string: bool,
+        ignore_errors: bool,
+        try_parse_dates: bool,
+        infer_schema_length: Optional[int],
+        n_rows: Optional[int],
+        encoding: CsvEncoding,
+        low_memory: bool,
+        rechunk: bool,
+        storage_options: Optional[Dict[str, Any]],
+        skip_rows_after_header: int,
+        row_index_name: Optional[str],
+        row_index_offset: int,
+        eol_char: str,
+        raise_if_empty: bool,
+        truncate_ragged_lines: bool,
+        decimal_comma: bool,
+        glob: bool,
+        cache: bool,
+        with_column_names: Optional[Callable[[List[str]], List[str]]],
+        **other_options: Any
+) -> str:
+    source_repr: str
+    if isinstance(source, (str, Path)):
+        source_repr = repr(str(source))
+    elif isinstance(source, list):
+        source_repr = repr([str(p) for p in source])
+    elif isinstance(source, bytes):
+        source_repr = "source_bytes_obj"
+    elif hasattr(source, 'read'):
+        source_repr = "source_file_like_obj"
+    else:
+        source_repr = repr(source)
+
+    param_mapping = {
+        'has_header': (True, lambda x: str(x)),
+        'separator': (',', lambda x: repr(str(x))),
+        'comment_prefix': (None, lambda x: repr(str(x)) if x is not None else 'None'),
+        'quote_char': ('"', lambda x: repr(str(x)) if x is not None else 'None'),
+        'skip_rows': (0, str),
+        'skip_lines': (0, str),
+        'schema': (None, lambda x: repr(x) if x is not None else 'None'),
+        'schema_overrides': (None, lambda x: repr(x) if x is not None else 'None'),
+        'null_values': (None, lambda x: repr(x) if x is not None else 'None'),
+        'missing_utf8_is_empty_string': (False, str),
+        'ignore_errors': (False, str),
+        'cache': (True, str),
+        'with_column_names': (None, lambda x: repr(x) if x is not None else 'None'),
+        'infer_schema_length': (100, lambda x: str(x) if x is not None else 'None'),
+        'n_rows': (None, lambda x: str(x) if x is not None else 'None'),
+        'encoding': ('utf8', lambda x: repr(str(x))),
+        'low_memory': (False, str),
+        'rechunk': (False, str),
+        'skip_rows_after_header': (0, str),
+        'row_index_name': (None, lambda x: repr(str(x)) if x is not None else 'None'),
+        'row_index_offset': (0, str),
+        'try_parse_dates': (False, str),
+        'eol_char': ('\n', lambda x: repr(str(x))),
+        'new_columns': (None, lambda x: repr(x) if x is not None else 'None'),
+        'raise_if_empty': (True, str),
+        'truncate_ragged_lines': (False, str),
+        'decimal_comma': (False, str),
+        'glob': (True, str),
+        'storage_options': (None, lambda x: repr(x) if x is not None else 'None'),
+    }
+
+    all_vars = locals()
+    kwargs_list = []
+
+    for param_name_key, (default_value, format_func) in param_mapping.items():
+        value = all_vars.get(param_name_key)
+        formatted_value = format_func(value)
+        kwargs_list.append(f"{param_name_key}={formatted_value}")
+
+    if other_options:
+        for k, v in other_options.items():
+            kwargs_list.append(f"{k}={repr(v)}")
+
+    kwargs_str = ",\n    ".join(kwargs_list)
+
+    if kwargs_str:
+        polars_code = f"output_df = pl.scan_csv(\n    {source_repr},\n    {kwargs_str}\n)"
+    else:
+        polars_code = f"output_df = pl.scan_csv({source_repr})"
 
-    return
-        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
-        flow_graph=flow_graph,
-        node_id=node_id
-    )
+    return polars_code
 
 
 def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
```
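The rewritten `read_csv` decides between two node types: if every option sits inside the `can_use_native` envelope it emits a native read node (`input_schema.NodeRead`); otherwise it renders a `pl.scan_csv(...)` string via `_build_polars_code_args` and wraps it in a polars-code node. A sketch of both paths, under the same assumed `ff` import alias as above (file names are illustrative):

```python
# Native path: all parameters fall inside the can_use_native check,
# so a NodeRead with a ReceivedTable is added to the graph.
df_native = ff.read_csv("sales.csv", separator=";", has_header=True)

# Fallback path: null_values is one of the options that fails the
# can_use_native check, so a generated pl.scan_csv(...) polars-code
# node is added instead.
df_lazy = ff.read_csv("sales.csv", null_values=["NA", "n/a"])
```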
```diff
@@ -2091,3 +2385,113 @@ def concat(frames: List['FlowFrame'],
     return first_frame.concat(remaining_frames, how=how,
                               rechunk=rechunk, parallel=parallel,
                               description=description)
+
+
+def scan_csv(
+        source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
+        *,
+        flow_graph: Optional[Any] = None,  # Using Any for FlowGraph placeholder
+        separator: str = ',',
+        convert_to_absolute_path: bool = True,
+        description: Optional[str] = None,
+        has_header: bool = True,
+        new_columns: Optional[List[str]] = None,
+        comment_prefix: Optional[str] = None,
+        quote_char: Optional[str] = '"',
+        skip_rows: int = 0,
+        skip_lines: int = 0,
+        schema: Optional[SchemaDict] = None,
+        schema_overrides: Optional[Union[SchemaDict, Sequence[PolarsDataType]]] = None,
+        null_values: Optional[Union[str, List[str], Dict[str, str]]] = None,
+        missing_utf8_is_empty_string: bool = False,
+        ignore_errors: bool = False,
+        try_parse_dates: bool = False,
+        infer_schema: bool = True,
+        infer_schema_length: Optional[int] = 100,
+        n_rows: Optional[int] = None,
+        encoding: CsvEncoding = 'utf8',
+        low_memory: bool = False,
+        rechunk: bool = False,
+        storage_options: Optional[Dict[str, Any]] = None,
+        skip_rows_after_header: int = 0,
+        row_index_name: Optional[str] = None,
+        row_index_offset: int = 0,
+        eol_char: str = '\n',
+        raise_if_empty: bool = True,
+        truncate_ragged_lines: bool = False,
+        decimal_comma: bool = False,
+        glob: bool = True,
+        cache: bool = True,
+        with_column_names: Optional[Callable[[List[str]], List[str]]] = None,
+        **other_options: Any
+) -> FlowFrame:
+    """
+    Scan a CSV file into a FlowFrame. This function is an alias for read_csv.
+
+    This method is the same as read_csv but is provided for compatibility with
+    the polars API where scan_csv returns a LazyFrame.
+
+    See read_csv for full documentation.
+    """
+    return read_csv(
+        source=source,
+        flow_graph=flow_graph,
+        separator=separator,
+        convert_to_absolute_path=convert_to_absolute_path,
+        description=description,
+        has_header=has_header,
+        new_columns=new_columns,
+        comment_prefix=comment_prefix,
+        quote_char=quote_char,
+        skip_rows=skip_rows,
+        skip_lines=skip_lines,
+        schema=schema,
+        schema_overrides=schema_overrides,
+        null_values=null_values,
+        missing_utf8_is_empty_string=missing_utf8_is_empty_string,
+        ignore_errors=ignore_errors,
+        try_parse_dates=try_parse_dates,
+        infer_schema=infer_schema,
+        infer_schema_length=infer_schema_length,
+        n_rows=n_rows,
+        encoding=encoding,
+        low_memory=low_memory,
+        rechunk=rechunk,
+        storage_options=storage_options,
+        skip_rows_after_header=skip_rows_after_header,
+        row_index_name=row_index_name,
+        row_index_offset=row_index_offset,
+        eol_char=eol_char,
+        raise_if_empty=raise_if_empty,
+        truncate_ragged_lines=truncate_ragged_lines,
+        decimal_comma=decimal_comma,
+        glob=glob,
+        cache=cache,
+        with_column_names=with_column_names,
+        **other_options
+    )
+
+
+def scan_parquet(
+        file_path,
+        *,
+        flow_graph: FlowGraph = None,
+        description: str = None,
+        convert_to_absolute_path: bool = True,
+        **options
+) -> FlowFrame:
+    """
+    Scan a Parquet file into a FlowFrame. This function is an alias for read_parquet.
+
+    This method is the same as read_parquet but is provided for compatibility with
+    the polars API where scan_parquet returns a LazyFrame.
+
+    See read_parquet for full documentation.
+    """
+    return read_parquet(
+        file_path=file_path,
+        flow_graph=flow_graph,
+        description=description,
+        convert_to_absolute_path=convert_to_absolute_path,
+        **options
+    )
```
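The new `scan_csv` / `scan_parquet` functions are thin aliases that delegate to `read_csv` / `read_parquet`, mirroring polars' lazy-API naming. Usage is interchangeable; the `ff` import alias is assumed as above:

```python
# Both pairs are equivalent; the scan_* names exist for polars API parity.
a = ff.scan_csv("events.csv")
b = ff.read_csv("events.csv")

p = ff.scan_parquet("events.parquet")
q = ff.read_parquet("events.parquet")
```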