Flowfile 0.3.6-py3-none-any.whl → 0.3.7-py3-none-any.whl
This diff shows the contents of publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Potentially problematic release: this version of Flowfile might be problematic.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
- flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
- flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
- flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
- flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
- flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
- flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/METADATA +2 -2
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/RECORD +96 -94
- flowfile_core/__init__.py +1 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +1 -0
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/flowfile/code_generator/code_generator.py +71 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +597 -309
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_graph.py +619 -191
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +500 -89
- flowfile_core/flowfile/flow_node/models.py +125 -20
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +36 -5
- flowfile_core/main.py +32 -13
- flowfile_core/routes/cloud_connections.py +7 -11
- flowfile_core/routes/logs.py +2 -6
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +127 -51
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/input_schema.py +92 -64
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +144 -11
- flowfile_core/schemas/transform_schema.py +82 -17
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/__init__.py +0 -0
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +232 -110
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +150 -12
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- test_utils/s3/data_generator.py +1 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +6 -1
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/models.py +0 -193
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +0 -0
@@ -1,7 +1,7 @@
 import io
 import os
 from pathlib import Path
-from typing import Any, List, Optional, Union, Dict, Callable
+from typing import Any, List, Optional, Union, Dict, Callable, Literal
 
 import polars as pl
 from polars._typing import (SchemaDict, IO, PolarsDataType,
@@ -9,12 +9,13 @@ from polars._typing import (SchemaDict, IO, PolarsDataType,
 
 from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
 from flowfile_core.flowfile.flow_graph import FlowGraph
-from flowfile_core.schemas import input_schema, transform_schema
+from flowfile_core.schemas import input_schema, transform_schema, cloud_storage_schemas
 from flowfile_frame.config import logger
 from flowfile_frame.expr import col
-from flowfile_frame.flow_frame import
+from flowfile_frame.flow_frame import FlowFrame
 from flowfile_frame.utils import create_flow_graph
-
+from flowfile_frame.cloud_storage.secret_manager import get_current_user_id
+from flowfile_frame.utils import generate_node_id
 
 def sum(expr):
     """Sum aggregation function."""
@@ -278,6 +279,7 @@ def read_csv(
         node_id=node_id,
     )
 
+
 def _build_polars_code_args(
         source: Union[str, Path, IO[bytes], bytes, List[Union[str, Path, IO[bytes], bytes]]],
         separator: str,
@@ -377,13 +379,13 @@ def _build_polars_code_args(
     return polars_code
 
 
-def read_parquet(
+def read_parquet(source, *, flow_graph: FlowGraph = None, description: str = None,
                  convert_to_absolute_path: bool = True, **options) -> FlowFrame:
     """
     Read a Parquet file into a FlowFrame.
 
     Args:
-
+        source: Path to Parquet file
         flow_graph: if you want to add it to an existing graph
        description: if you want to add a readable name in the frontend (advised)
         convert_to_absolute_path: If the path needs to be set to a fixed location
@@ -392,8 +394,8 @@ def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str =
     Returns:
         A FlowFrame with the Parquet data
     """
-    if '~' in
-        file_path = os.path.expanduser(
+    if '~' in source:
+        file_path = os.path.expanduser(source)
     node_id = generate_node_id()
 
     if flow_graph is None:
@@ -403,8 +405,8 @@ def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str =
 
     received_table = input_schema.ReceivedTable(
         file_type='parquet',
-        path=
-        name=Path(
+        path=source,
+        name=Path(source).name,
     )
     if convert_to_absolute_path:
         received_table.path = received_table.abs_file_path
@@ -592,7 +594,7 @@ def scan_csv(
 
 
 def scan_parquet(
-
+        source,
         *,
         flow_graph: FlowGraph = None,
         description: str = None,
@@ -608,10 +610,146 @@ def scan_parquet(
     See read_parquet for full documentation.
     """
     return read_parquet(
-
+        source=source,
         flow_graph=flow_graph,
         description=description,
         convert_to_absolute_path=convert_to_absolute_path,
         **options
     )
 
+
+def scan_parquet_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+        description: Optional[str] = None
+) -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              file_format="parquet"),
+        user_id=get_current_user_id(),
+        description=description)
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_csv_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+        delimiter: str = ";",
+        has_header: Optional[bool] = True,
+        encoding: Optional[CsvEncoding] = "utf8") -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              csv_delimiter=delimiter,
+                                                                              csv_encoding=encoding,
+                                                                              csv_has_header=has_header,
+                                                                              file_format="csv"),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_delta(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        version: int = None) -> FlowFrame:
+    node_id = generate_node_id()
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              connection_name=connection_name,
+                                                                              file_format="delta",
+                                                                              delta_version=version),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
+
+def scan_json_from_cloud_storage(
+        source: str,
+        *,
+        flow_graph: Optional[FlowGraph] = None,
+        connection_name: Optional[str] = None,
+        scan_mode: Literal["single_file", "directory", None] = None,
+) -> FlowFrame:
+    node_id = generate_node_id()
+
+    if scan_mode is None:
+        if source[-1] in ("*", "/"):
+            scan_mode: Literal["single_file", "directory"] = "directory"
+        else:
+            scan_mode: Literal["single_file", "directory"] = "single_file"
+
+    if flow_graph is None:
+        flow_graph = create_flow_graph()
+    flow_id = flow_graph.flow_id
+    settings = input_schema.NodeCloudStorageReader(
+        flow_id=flow_id,
+        node_id=node_id,
+        cloud_storage_settings=cloud_storage_schemas.CloudStorageReadSettings(resource_path=source,
+                                                                              scan_mode=scan_mode,
+                                                                              connection_name=connection_name,
+                                                                              file_format="json"),
+        user_id=get_current_user_id())
+    flow_graph.add_cloud_storage_reader(settings)
+    return FlowFrame(
+        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
+        flow_graph=flow_graph,
+        node_id=node_id
+    )
+
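The added scan_*_from_cloud_storage helpers all follow the same pattern: build a NodeCloudStorageReader, attach it to a FlowGraph, and wrap the node's result in a FlowFrame. A minimal usage sketch follows (not part of the diff; the bucket paths, the connection name "my-s3", and the import path are illustrative assumptions, since the re-export is not shown in this hunk):

import flowfile_frame as ff  # assumed import path for illustration only

# With scan_mode=None, a trailing "/" or "*" selects "directory", otherwise "single_file"
sales = ff.scan_parquet_from_cloud_storage(
    "s3://data-lake/sales/",            # illustrative path
    connection_name="my-s3",            # illustrative, previously registered connection
    description="partitioned sales data",
)

orders = ff.scan_csv_from_cloud_storage(
    "s3://raw-data/orders/orders.csv",  # illustrative path
    connection_name="my-s3",
    delimiter="|",
    has_header=True,
)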
flowfile_frame/group_frame.py CHANGED
@@ -91,6 +91,8 @@ class GroupByFrame:
             if isinstance(col_expr, str):
                 agg_cols.append(transform_schema.AggColl(old_name=col_expr, agg="groupby"))
             elif isinstance(col_expr, Expr):
+                if col_expr.is_complex:
+                    return False
                 agg_cols.append(transform_schema.AggColl(old_name=col_expr.column_name, agg="groupby"))
             elif isinstance(col_expr, Selector):
                 return False
@@ -151,6 +153,7 @@ class GroupByFrame:
     def _create_agg_node(self, node_id_to_use: int, can_be_converted: bool, agg_cols: list, agg_expressions,
                          named_agg_exprs, convertable_to_code: bool, description: str):
         """Create node for explicit aggregations via self.agg()."""
+
         if can_be_converted:
             group_by_settings = input_schema.NodeGroupBy(
                 flow_id=self.parent.flow_graph.flow_id,
flowfile_frame/utils.py CHANGED
@@ -88,14 +88,23 @@ def _generate_id() -> int:
     return int(uuid.uuid4().int % 100000)
 
 
-def create_flow_graph() -> FlowGraph:
-
+def create_flow_graph(flow_id: int = None) -> FlowGraph:
+    """
+    Create a new FlowGraph instance with a unique flow ID.
+    Parameters
+    - flow_id (int): Optional flow ID. If not provided, a new unique ID will be generated.
+    Returns
+    - FlowGraph: A new instance of FlowGraph with the specified or generated flow ID.
+
+    """
+    if flow_id is None:
+        flow_id = _generate_id()
     flow_settings = schemas.FlowSettings(
         flow_id=flow_id,
         name=f"Flow_{flow_id}",
         path=f"flow_{flow_id}"
     )
-    flow_graph = FlowGraph(
+    flow_graph = FlowGraph(flow_settings=flow_settings)
     flow_graph.flow_settings.execution_location = 'local'  # always create a local frame so that the run time does not attempt to use the flowfile_worker process
     return flow_graph
 
@@ -119,3 +128,16 @@ def stringify_values(v: Any) -> str:
     else:
         # Handle any other types
         return str(v)
+
+
+data = {"c": 0}
+
+
+def generate_node_id() -> int:
+    data["c"] += 1
+    return data["c"]
+
+
+def set_node_id(node_id):
+    """Set the node ID to a specific value."""
+    data["c"] = node_id
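Note that generate_node_id and set_node_id share a module-level counter, so node IDs are sequential per process rather than per graph. A short sketch of the behaviour implied by the hunk above:

from flowfile_frame.utils import generate_node_id, set_node_id

generate_node_id()  # 1
generate_node_id()  # 2
set_node_id(10)     # move the shared counter to 10
generate_node_id()  # 11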
test_utils/s3/data_generator.py CHANGED
@@ -24,6 +24,7 @@ MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
 MINIO_CONTAINER_NAME = os.environ.get("TEST_MINIO_CONTAINER", "test-minio-s3")
 MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
 
+
 def _create_single_csv_file(s3_client, df: pl.DataFrame, bucket_name: str):
     """Creates a single CSV file from a DataFrame and uploads it to S3."""
     logger.info("Writing single-file CSV...")
test_utils/s3/demo_data_generator.py ADDED
@@ -0,0 +1,186 @@
+import logging
+import io
+import os
+import tempfile
+import shutil
+import random
+from datetime import datetime, timedelta
+
+# Third-party libraries
+import boto3
+from botocore.client import Config
+import polars as pl
+import pyarrow as pa
+from pyarrow import parquet as pq
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# --- MinIO/S3 Configuration ---
+MINIO_HOST = os.environ.get("TEST_MINIO_HOST", "localhost")
+MINIO_PORT = int(os.environ.get("TEST_MINIO_PORT", 9000))
+MINIO_ACCESS_KEY = os.environ.get("TEST_MINIO_ACCESS_KEY", "minioadmin")
+MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
+MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
+
+# --- Data Generation Functions ---
+
+def _create_sales_data(s3_client, df: pl.DataFrame, bucket_name: str):
+    """
+    Creates partitioned Parquet files for the sales data based on year and month.
+    s3://data-lake/sales/year=YYYY/month=MM/
+    """
+    logger.info("Writing partitioned sales data...")
+    # Use Polars' built-in partitioning
+    # A temporary local directory is needed to stage the partitioned files before uploading
+    with tempfile.TemporaryDirectory() as temp_dir:
+        df.write_parquet(
+            temp_dir,
+            use_pyarrow=True,
+            pyarrow_options={"partition_cols": ["year", "month"]}
+        )
+        # Walk through the local directory and upload files to S3
+        for root, _, files in os.walk(temp_dir):
+            for file in files:
+                if file.endswith(".parquet"):
+                    local_path = os.path.join(root, file)
+                    # Construct the S3 key to match the desired structure
+                    relative_path = os.path.relpath(local_path, temp_dir)
+                    s3_key = f"data-lake/sales/{relative_path.replace(os.path.sep, '/')}"
+                    s3_client.upload_file(local_path, bucket_name, s3_key)
+    logger.info(f"Finished writing sales data to s3://{bucket_name}/data-lake/sales/")
+
+def _create_customers_data(s3_client, df: pl.DataFrame, bucket_name: str):
+    """
+    Creates a Parquet file for the customers data.
+    s3://data-lake/customers/
+    """
+    logger.info("Writing customers Parquet data...")
+    parquet_buffer = io.BytesIO()
+    df.write_parquet(parquet_buffer)
+    parquet_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='data-lake/customers/customers.parquet',
+        Body=parquet_buffer.getvalue()
+    )
+    logger.info(f"Finished writing customers data to s3://{bucket_name}/data-lake/customers/")
+
+
+def _create_orders_data(s3_client, df: pl.DataFrame, bucket_name: str):
+    """
+    Creates a pipe-delimited CSV file for the orders data.
+    s3://raw-data/orders/
+    """
+    logger.info("Writing orders CSV data...")
+    csv_buffer = io.BytesIO()
+    # Write with pipe delimiter and header
+    df.write_csv(csv_buffer, separator="|")
+    csv_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='raw-data/orders/orders.csv',
+        Body=csv_buffer.getvalue()
+    )
+    logger.info(f"Finished writing orders data to s3://{bucket_name}/raw-data/orders/")
+
+def _create_products_data(df: pl.DataFrame):
+    """
+    Creates a local Parquet file for the products data.
+    """
+    logger.info("Writing local products Parquet data...")
+    # Create a directory for local data if it doesn't exist
+    local_data_dir = "local_data"
+    os.makedirs(local_data_dir, exist_ok=True)
+    file_path = os.path.join(local_data_dir, "local_products.parquet")
+    df.write_parquet(file_path)
+    logger.info(f"Finished writing products data to {file_path}")
+
+
+def create_demo_data(endpoint_url: str, access_key: str, secret_key: str, bucket_name: str):
+    """
+    Populates a MinIO bucket with test data matching the schemas from the examples.
+    """
+    logger.info("🚀 Starting data population for flowfile examples...")
+    s3_client = boto3.client(
+        's3',
+        endpoint_url=endpoint_url,
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        config=Config(signature_version='s3v4'),
+        region_name='us-east-1'
+    )
+
+    # --- Generate Core DataFrames ---
+    DATA_SIZE = 15_000  # Increased data size for more variety
+    START_DATE = datetime(2022, 1, 1)
+    END_DATE = datetime(2024, 12, 31)
+    TOTAL_DAYS = (END_DATE - START_DATE).days
+
+    # States for region mapping
+    states = ["CA", "OR", "WA", "NY", "NJ", "PA", "TX", "FL", "GA", "IL", "OH", "MI"]
+
+    # Generate base sales data across multiple years
+    sales_data = {
+        "order_id": range(1, DATA_SIZE + 1),
+        "customer_id": [random.randint(100, 299) for _ in range(DATA_SIZE)],
+        "product_id": [random.randint(1, 100) for _ in range(DATA_SIZE)],
+        "order_date": [START_DATE + timedelta(days=random.randint(0, TOTAL_DAYS)) for _ in range(DATA_SIZE)],
+        "quantity": [random.randint(1, 5) for _ in range(DATA_SIZE)],
+        "unit_price": [round(random.uniform(10.0, 500.0), 2) for _ in range(DATA_SIZE)],
+        "discount_rate": [random.choice([0.0, 0.1, 0.15, 0.2, None]) for _ in range(DATA_SIZE)],
+        "status": [random.choice(["completed", "pending", "cancelled"]) for _ in range(DATA_SIZE)],
+        "customer_lifetime_value": [random.uniform(500, 20000) for _ in range(DATA_SIZE)],
+        "state": [random.choice(states) for _ in range(DATA_SIZE)],
+    }
+    sales_df = pl.from_dict(sales_data).with_columns([
+        pl.col("order_date").dt.year().alias("year"),
+        pl.col("order_date").dt.month().alias("month"),
+        # The 'amount' column in the example seems to be the price before discount
+        pl.col("unit_price").alias("amount")
+    ])
+
+    # Generate customers DataFrame
+    unique_customer_ids = sales_df["customer_id"].unique().to_list()
+    customers_df = pl.DataFrame({
+        "customer_id": unique_customer_ids,
+        "customer_segment": [random.choice(["VIP", "Regular", "New"]) for _ in unique_customer_ids]
+    })
+
+    # Generate products DataFrame
+    unique_product_ids = sales_df["product_id"].unique().to_list()
+    # Create a map of product_id to unit_price from the first occurrence in sales_df
+    product_price_map = sales_df.group_by("product_id").agg(pl.first("unit_price")).to_dict(as_series=False)
+    price_dict = dict(zip(product_price_map['product_id'], product_price_map['unit_price']))
+
+    products_df = pl.DataFrame({
+        "product_id": unique_product_ids,
+        "product_category": [random.choice(["Electronics", "Books", "Clothing", "Home Goods"]) for _ in unique_product_ids],
+        "unit_price": [price_dict.get(pid) for pid in unique_product_ids]
+    })
+
+    # Generate orders DataFrame for the CSV file (subset of sales)
+    orders_df = sales_df.select(["customer_id", "product_id", "quantity", "discount_rate"])
+
+    logger.info(f"Generated {len(sales_df)} sales records across {sales_df['year'].n_unique()} years, for {len(customers_df)} customers, and {len(products_df)} products.")
+
+    # --- Write Data to S3 and Local Filesystem ---
+    _create_sales_data(s3_client, sales_df, bucket_name)
+    _create_customers_data(s3_client, customers_df, bucket_name)
+    _create_orders_data(s3_client, orders_df, bucket_name)
+    _create_products_data(products_df)
+
+    logger.info("✅ All test data populated successfully.")
+
+
+if __name__ == '__main__':
+    # The bucket that will be created and populated
+    BUCKET = "flowfile-demo-data"
+
+    create_demo_data(
+        endpoint_url=MINIO_ENDPOINT_URL,
+        access_key=MINIO_ACCESS_KEY,
+        secret_key=MINIO_SECRET_KEY,
+        bucket_name=BUCKET
+    )
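The new demo data generator can also be run by hand against a local MinIO instance; a sketch, assuming MinIO is reachable with the default test credentials used above and that the target bucket already exists (the fixtures below create "demo-bucket" before calling it):

from test_utils.s3.demo_data_generator import create_demo_data

create_demo_data(
    endpoint_url="http://localhost:9000",  # default MINIO_ENDPOINT_URL
    access_key="minioadmin",
    secret_key="minioadmin",
    bucket_name="demo-bucket",             # any existing bucket works
)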
test_utils/s3/fixtures.py CHANGED
@@ -8,6 +8,7 @@ import shutil
 import boto3
 from botocore.client import Config
 from test_utils.s3.data_generator import populate_test_data
+from test_utils.s3.demo_data_generator import create_demo_data
 
 logger = logging.getLogger("s3_fixture")
 
@@ -102,7 +103,7 @@ def create_test_buckets():
     client = get_minio_client()
 
     # Create test buckets
-    buckets = ['test-bucket', 'flowfile-test', 'sample-data', 'worker-test-bucket']
+    buckets = ['test-bucket', 'flowfile-test', 'sample-data', 'worker-test-bucket', 'demo-bucket']
     for bucket in buckets:
         try:
             client.create_bucket(Bucket=bucket)
@@ -176,6 +177,10 @@ def start_minio_container() -> bool:
                            access_key=MINIO_ACCESS_KEY,
                            secret_key=MINIO_SECRET_KEY,
                            bucket_name="test-bucket")
+        create_demo_data(endpoint_url=MINIO_ENDPOINT_URL,
+                         access_key=MINIO_ACCESS_KEY,
+                         secret_key=MINIO_SECRET_KEY,
+                         bucket_name="demo-bucket")
         return True
     return False
 