Flowfile 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of Flowfile might be problematic.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/api-6ef0dcef.js +80 -0
- flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +9 -6
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +472 -17
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +718 -253
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +563 -117
- flowfile_core/flowfile/flow_node/models.py +154 -20
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +35 -26
- flowfile_core/main.py +35 -15
- flowfile_core/routes/cloud_connections.py +77 -0
- flowfile_core/routes/logs.py +2 -7
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +130 -90
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +121 -71
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +150 -12
- flowfile_core/schemas/transform_schema.py +175 -35
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +481 -208
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +160 -22
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +292 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +214 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_core/schemas/models.py +0 -193
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
flowfile_worker/funcs.py
CHANGED
@@ -6,7 +6,9 @@ from flowfile_worker.polars_fuzzy_match.matcher import fuzzy_match_dfs
 from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
 from flowfile_worker.flow_logger import get_worker_logger
 from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
-from flowfile_worker.external_sources.sql_source.main import
+from flowfile_worker.external_sources.sql_source.main import write_df_to_database
+from flowfile_worker.external_sources.s3_source.main import write_df_to_cloud
+from flowfile_worker.external_sources.s3_source.models import CloudStorageWriteSettings
 from base64 import encodebytes
 from logging import Logger
 import logging
@@ -205,9 +207,9 @@ def execute_write_method(write_method: Callable, path: str, data_type: str = Non
         logger.info('Writing as csv file')
         if write_mode == 'append':
             with open(path, 'ab') as f:
-                write_method(
+                write_method(f, separator=delimiter, quote_style='always')
         else:
-            write_method(
+            write_method(path, separator=delimiter, quote_style='always')
     elif data_type == 'parquet':
         logger.info('Writing as parquet file')
         write_method(path)
@@ -243,6 +245,49 @@ def write_to_database(polars_serializable_object: bytes,
             progress.value = -1


+def write_to_cloud_storage(polars_serializable_object: bytes,
+                           progress: Value,
+                           error_message: Array,
+                           queue: Queue,
+                           file_path: str,
+                           cloud_write_settings: CloudStorageWriteSettings,
+                           flowfile_flow_id: int = -1,
+                           flowfile_node_id: int | str = -1
+                           ) -> None:
+    """
+    Writes a Polars DataFrame to cloud storage using the provided settings.
+    Args:
+        polars_serializable_object (): # Serialized Polars DataFrame object
+        progress (): Multiprocessing Value to track progress
+        error_message (): Array to store error messages
+        queue (): Queue to send results back
+        file_path (): Path to the file where the DataFrame will be written
+        cloud_write_settings (): CloudStorageWriteSettings object containing write settings and connection details
+        flowfile_flow_id (): Flowfile flow ID for logging
+        flowfile_node_id (): Flowfile node ID for logging
+
+    Returns:
+        None
+    """
+    flowfile_logger = get_worker_logger(flowfile_flow_id, flowfile_node_id)
+    flowfile_logger.info(f"Starting write operation to: {cloud_write_settings.write_settings.resource_path}")
+    df = pl.LazyFrame.deserialize(io.BytesIO(polars_serializable_object))
+    flowfile_logger.info(f"Starting to sync the data to cloud, execution plan: \n"
+                         f"{df.explain(format='plain')}")
+    try:
+        write_df_to_cloud(df, cloud_write_settings, flowfile_logger)
+        flowfile_logger.info("Write operation completed successfully")
+        with progress.get_lock():
+            progress.value = 100
+    except Exception as e:
+        error_msg = str(e).encode()[:1024]
+        flowfile_logger.error(f'Error during write operation: {str(e)}')
+        with error_message.get_lock():
+            error_message[:len(error_msg)] = error_msg
+        with progress.get_lock():
+            progress.value = -1
+
+
 def write_output(polars_serializable_object: bytes,
                  progress: Value,
                  error_message: Array,
@@ -263,16 +308,16 @@ def write_output(polars_serializable_object: bytes,
     if isinstance(df, pl.LazyFrame):
         flowfile_logger.info(f'Execution plan explanation:\n{df.explain(format="plain")}')
     flowfile_logger.info("Successfully deserialized dataframe")
-    is_lazy = False
     sink_method_str = 'sink_'+data_type
     write_method_str = 'write_'+data_type
     has_sink_method = hasattr(df, sink_method_str)
     write_method = None
     if os.path.exists(path) and write_mode == 'create':
         raise Exception('File already exists')
-    if has_sink_method and
+    if has_sink_method and write_method != 'append':
+        flowfile_logger.info(f'Using sink method: {sink_method_str}')
         write_method = getattr(df, 'sink_' + data_type)
-    elif not
+    elif not has_sink_method:
         if isinstance(df, pl.LazyFrame):
             df = collect_lazy_frame(df)
         write_method = getattr(df, write_method_str)
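Note on the new write_to_cloud_storage above: it follows the same progress and error protocol as the existing worker functions, a shared multiprocessing Value set to 100 on success or -1 on failure, plus a fixed-size Array that receives the error text truncated to 1024 bytes. A standalone sketch of that convention (the task body and names here are illustrative, not taken from the package):

from multiprocessing import Array, Process, Value


def risky_task(progress, error_message):
    # Same convention as the worker functions: -1 plus the error text on failure.
    try:
        raise RuntimeError("simulated failure")  # stand-in for the real write
    except Exception as e:
        msg = str(e).encode()[:1024]
        with error_message.get_lock():
            error_message[:len(msg)] = msg
        with progress.get_lock():
            progress.value = -1


if __name__ == "__main__":
    progress = Value("i", 0)          # shared progress flag, as in the worker
    error_message = Array("c", 1024)  # shared byte buffer for the error text
    p = Process(target=risky_task, args=(progress, error_message))
    p.start()
    p.join()
    print(progress.value, error_message[:].split(b"\x00", 1)[0].decode())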
flowfile_worker/models.py
CHANGED
@@ -3,11 +3,12 @@ from typing import Optional, Literal, Any
 from base64 import decodebytes
 from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
 from flowfile_worker.external_sources.sql_source.models import DatabaseWriteSettings
+from flowfile_worker.external_sources.s3_source.models import CloudStorageWriteSettings


 OperationType = Literal[
     'store', 'calculate_schema', 'calculate_number_of_records', 'write_output', 'fuzzy', 'store_sample',
-    'write_to_database']
+    'write_to_database', "write_to_cloud_storage",]
 ResultType = Literal['polars', 'other']


@@ -55,7 +56,6 @@ class DatabaseScriptWrite(DatabaseWriteSettings):
         Returns:
             DatabaseWriteSettings: The corresponding DatabaseWriteSettings object.
         """
-
         return DatabaseWriteSettings(
             connection=self.connection,
             table_name=self.table_name,
@@ -65,6 +65,26 @@ class DatabaseScriptWrite(DatabaseWriteSettings):
         )


+class CloudStorageScriptWrite(CloudStorageWriteSettings):
+    operation: bytes
+
+    def polars_serializable_object(self):
+        return decodebytes(self.operation)
+
+    def get_cloud_storage_write_settings(self) -> CloudStorageWriteSettings:
+        """
+        Converts the current instance to a DatabaseWriteSettings object.
+        Returns:
+            DatabaseWriteSettings: The corresponding DatabaseWriteSettings object.
+        """
+        return CloudStorageWriteSettings(
+            write_settings=self.write_settings,
+            connection=self.connection,
+            flowfile_flow_id=self.flowfile_flow_id,
+            flowfile_node_id=self.flowfile_node_id
+        )
+
+
 class FuzzyJoinInput(BaseModel):
     task_id: Optional[str] = None
     cache_dir: Optional[str] = None
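The base64 round-trip implied by CloudStorageScriptWrite.operation (decoded here with decodebytes, encoded with encodebytes on the funcs.py side) pairs with Polars' LazyFrame serialization. A minimal sketch of the full encode/decode path, assuming Polars 1.x where LazyFrame.serialize() returns binary bytes by default:

import io
from base64 import decodebytes, encodebytes

import polars as pl

# Sender side: serialize a lazy plan and base64-encode it for transport in a JSON body.
lf = pl.LazyFrame({"id": [1, 2, 3], "value": [10.0, 20.0, 30.0]}).filter(pl.col("value") > 15)
operation = encodebytes(lf.serialize())

# Receiver side, mirroring CloudStorageScriptWrite.polars_serializable_object()
# followed by the deserialization done in the worker functions.
raw = decodebytes(operation)
restored = pl.LazyFrame.deserialize(io.BytesIO(raw))
print(restored.explain(format="plain"))
print(restored.collect())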
flowfile_worker/routes.py
CHANGED
@@ -10,10 +10,8 @@ from flowfile_worker import models
 from flowfile_worker.spawner import start_process, start_fuzzy_process, start_generic_process, process_manager
 from flowfile_worker.create import table_creator_factory_method, received_table_parser, FileType
 from flowfile_worker.configs import logger
-from flowfile_worker.external_sources.airbyte_sources.models import AirbyteSettings
 from flowfile_worker.external_sources.sql_source.models import DatabaseReadSettings
 from flowfile_worker.external_sources.sql_source.main import read_sql_source, write_serialized_df_to_database
-from flowfile_worker.external_sources.airbyte_sources.main import read_airbyte_source


 router = APIRouter()
@@ -74,6 +72,44 @@ def store_sample(polars_script: models.PolarsScriptSample, background_tasks: Bac
         raise HTTPException(status_code=500, detail=str(e))


+@router.post("/write_data_to_cloud/")
+def write_data_to_cloud(cloud_storage_script_write: models.CloudStorageScriptWrite,
+                        background_tasks: BackgroundTasks) -> models.Status:
+    """
+    Write polars dataframe to a file in cloud storage.
+    Args:
+        cloud_storage_script_write (): Contains dataframe and write options for cloud storage
+        background_tasks (): FastAPI background tasks handler
+
+    Returns:
+        models.Status: Status object tracking the write operation
+    """
+    try:
+        logger.info("Starting write operation to: cloud storage")
+        task_id = str(uuid.uuid4())
+        polars_serializable_object = cloud_storage_script_write.polars_serializable_object()
+        status = models.Status(background_task_id=task_id, status="Starting", file_ref='',
+                               result_type="other")
+        status_dict[task_id] = status
+        background_tasks.add_task(
+            start_process,
+            polars_serializable_object=polars_serializable_object,
+            task_id=task_id,
+            operation="write_to_cloud_storage",
+            file_ref='',
+            flowfile_flow_id=cloud_storage_script_write.flowfile_flow_id,
+            flowfile_node_id=cloud_storage_script_write.flowfile_node_id,
+            kwargs=dict(cloud_write_settings=cloud_storage_script_write.get_cloud_storage_write_settings()),
+        )
+        logger.info(
+            f"Started write task: {task_id} to database"
+        )
+        return status
+    except Exception as e:
+        logger.error(f"Error in write operation: {str(e)}", exc_info=True)
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @router.post('/store_database_write_result/')
 def store_in_database(database_script_write: models.DatabaseScriptWrite, background_tasks: BackgroundTasks) -> models.Status:
     """
@@ -158,44 +194,10 @@ def write_results(polars_script_write: models.PolarsScriptWrite, background_task
         raise HTTPException(status_code=500, detail=str(e))


-@router.post('/store_airbyte_result')
-def store_airbyte_result(airbyte_settings: AirbyteSettings, background_tasks: BackgroundTasks) -> models.Status:
-    """
-    Store the result of an Airbyte source operation.
-
-    Args:
-        airbyte_settings (AirbyteSettings): Settings for the Airbyte source operation
-        background_tasks (BackgroundTasks): FastAPI background tasks handler
-
-    Returns:
-        models.Status: Status object tracking the Airbyte source operation
-    """
-    logger.info("Processing Airbyte source operation")
-
-    try:
-        task_id = str(uuid.uuid4())
-        file_path = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
-        status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
-                               result_type="polars")
-        status_dict[task_id] = status
-        logger.info(f"Starting Airbyte source task: {task_id}")
-        background_tasks.add_task(start_generic_process, func_ref=read_airbyte_source, file_ref=file_path,
-                                  flowfile_flow_id=airbyte_settings.flowfile_flow_id,
-                                  flowfile_node_id=airbyte_settings.flowfile_node_id,
-                                  task_id=task_id, kwargs=dict(airbyte_settings=airbyte_settings))
-        logger.info(f"Started Airbyte source task: {task_id}")
-
-        return status
-
-    except Exception as e:
-        logger.error(f"Error processing Airbyte source: {str(e)}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))
-
-
 @router.post('/store_database_read_result')
 def store_sql_db_result(database_read_settings: DatabaseReadSettings, background_tasks: BackgroundTasks) -> models.Status:
     """
-    Store the result of an
+    Store the result of an sql source operation.

     Args:
         database_read_settings (SQLSourceSettings): Settings for the SQL source operation
@@ -204,7 +206,7 @@ def store_sql_db_result(database_read_settings: DatabaseReadSettings, background
     Returns:
         models.Status: Status object tracking the Sql operation
     """
-    logger.info("Processing
+    logger.info("Processing Sql source operation")

     try:
         task_id = str(uuid.uuid4())
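For context, a client of the new /write_data_to_cloud/ endpoint would POST a JSON body shaped like models.CloudStorageScriptWrite and then poll the returned Status. The sketch below is hypothetical: the worker's address, the router prefix, and the exact shape of the connection and write_settings sub-objects are assumptions, not taken from this diff.

import base64

import polars as pl
import requests

WORKER_URL = "http://localhost:8000"  # hypothetical worker address and prefix

lf = pl.LazyFrame({"id": [1, 2, 3]})
payload = {
    # Mirrors CloudStorageScriptWrite.operation: base64 of the serialized lazy plan.
    "operation": base64.encodebytes(lf.serialize()).decode(),
    # The nested settings mirror CloudStorageWriteSettings; their fields are assumed here.
    "connection": {},
    "write_settings": {"resource_path": "s3://my-bucket/out.parquet"},
    "flowfile_flow_id": 1,
    "flowfile_node_id": 1,
}

resp = requests.post(f"{WORKER_URL}/write_data_to_cloud/", json=payload, timeout=30)
resp.raise_for_status()
status = resp.json()
# models.Status fields seen in the diff: background_task_id, status, file_ref, result_type.
print(status["background_task_id"], status["status"])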
test_utils/s3/commands.py
ADDED
@@ -0,0 +1,46 @@
+import logging
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger("postgres_commands")
+
+
+def start_minio():
+    """Start MinIO container for S3 testing"""
+    from . import fixtures
+    if not fixtures.is_docker_available():
+        logger.warning("Docker is not available. Cannot start PostgreSQL container.")
+        print("\n" + "=" * 50)
+        print("SKIPPING: Docker is not available on this system")
+        print("Tests requiring Docker will need to be skipped")
+        print("=" * 50 + "\n")
+        return 0  # Return success to allow pipeline to continue
+
+
+    if fixtures.start_minio_container():
+        print(f"MinIO started at http://localhost:{fixtures.MINIO_PORT}")
+        print(f"Access Key: {fixtures.MINIO_ACCESS_KEY}")
+        return 0
+    return 1
+
+
+def stop_minio():
+    """Stop MinIO container"""
+    from . import fixtures
+
+    if not fixtures.is_docker_available():
+        logger.warning("Docker is not available. Cannot stop MinIO container.")
+        print("\n" + "=" * 50)
+        print("SKIPPING: Docker is not available on this system")
+        print("Tests requiring Docker will need to be skipped")
+        print("=" * 50 + "\n")
+        return 0
+
+    if fixtures.stop_minio_container():
+        print("MinIO stopped successfully")
+        return 0
+    return 1
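Given the package layout in the file listing (test_utils/s3/commands.py next to test_utils/s3/fixtures.py), the helpers above could be driven from a local test session roughly as follows; a sketch only, with the import path and return-code handling inferred from the code above:

# Hypothetical driver for the MinIO helpers above.
from test_utils.s3 import commands

exit_code = commands.start_minio()   # 0 on success (or when Docker is unavailable), 1 on failure
try:
    if exit_code == 0:
        print("MinIO is up; S3-backed tests can run against it now.")
finally:
    commands.stop_minio()            # tear the container back down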
test_utils/s3/data_generator.py
ADDED
@@ -0,0 +1,292 @@
+
+import logging
+import io
+import os
+
+# Third-party libraries
+import boto3
+from botocore.client import Config
+import polars as pl
+import pyarrow as pa
+from deltalake import write_deltalake
+from pyiceberg.catalog import load_catalog
+
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+MINIO_HOST = os.environ.get("TEST_MINIO_HOST", "localhost")
+MINIO_PORT = int(os.environ.get("TEST_MINIO_PORT", 9000))
+MINIO_CONSOLE_PORT = int(os.environ.get("TEST_MINIO_CONSOLE_PORT", 9001))
+MINIO_ACCESS_KEY = os.environ.get("TEST_MINIO_ACCESS_KEY", "minioadmin")
+MINIO_SECRET_KEY = os.environ.get("TEST_MINIO_SECRET_KEY", "minioadmin")
+MINIO_CONTAINER_NAME = os.environ.get("TEST_MINIO_CONTAINER", "test-minio-s3")
+MINIO_ENDPOINT_URL = f"http://{MINIO_HOST}:{MINIO_PORT}"
+
+
+def _create_single_csv_file(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single CSV file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file CSV...")
+    csv_buffer = io.BytesIO()
+    df.write_csv(csv_buffer)
+    csv_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-csv/data.csv',
+        Body=csv_buffer.getvalue()
+    )
+
+
+def _create_multi_file_csv(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple CSV files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} CSV files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        csv_buffer = io.BytesIO()
+        sub_df.write_csv(csv_buffer)
+        csv_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-csv/part_{i:02d}.csv',
+            Body=csv_buffer.getvalue()
+        )
+
+
+def _create_single_file_json(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single JSON file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file JSON...")
+    json_buffer = io.BytesIO()
+    df.write_ndjson(json_buffer)
+    json_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-json/data.json',
+        Body=json_buffer.getvalue()
+    )
+
+
+def _create_multi_file_json(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple JSON files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} JSON files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        json_buffer = io.BytesIO()
+        sub_df.write_ndjson(json_buffer)
+        json_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-json/part_{i:02d}.json',
+            Body=json_buffer.getvalue()
+        )
+
+
+def _create_single_parquet_file(s3_client, df: pl.DataFrame, bucket_name: str):
+    """Creates a single Parquet file from a DataFrame and uploads it to S3."""
+    logger.info("Writing single-file Parquet...")
+    parquet_buffer = io.BytesIO()
+    df.write_parquet(parquet_buffer)
+    parquet_buffer.seek(0)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key='single-file-parquet/data.parquet',
+        Body=parquet_buffer.getvalue()
+    )
+
+
+def _create_multi_parquet_file(s3_client, df: pl.DataFrame, bucket_name: str, num_files: int = 10):
+    """Creates multiple Parquet files from a DataFrame and uploads them to S3."""
+    logger.info(f"Writing {num_files} Parquet files...")
+    data_size = len(df)
+    rows_per_file = data_size // num_files
+    for i in range(num_files):
+        sub_df = df.slice(i * rows_per_file, rows_per_file)
+        parquet_buffer = io.BytesIO()
+        sub_df.write_parquet(parquet_buffer)
+        parquet_buffer.seek(0)
+        s3_client.put_object(
+            Bucket=bucket_name,
+            Key=f'multi-file-parquet/part_{i:02d}.parquet',
+            Body=parquet_buffer.getvalue()
+        )
+
+
+def _create_delta_lake_table(arrow_table: pa.Table, bucket_name: str, storage_options: dict):
+    """Creates a Delta Lake table from a PyArrow table in S3."""
+    logger.info("Writing Delta Lake table...")
+    delta_table_path = f"s3://{bucket_name}/delta-lake-table"
+    write_deltalake(
+        delta_table_path,
+        arrow_table,
+        mode='overwrite',
+        storage_options=storage_options
+    )
+
+
+def _create_iceberg_table(df: pl.DataFrame, bucket_name: str, endpoint_url: str, access_key: str, secret_key: str,
+                          s3_client):
+    """Creates an Apache Iceberg table and FORCES sane metadata pointers."""
+    logger.info("Writing Apache Iceberg table with SANE metadata access...")
+    # Configure the catalog properties for S3 access
+    catalog_props = {
+        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
+        "s3.endpoint": endpoint_url,
+        "s3.access-key-id": access_key,
+        "s3.secret-access-key": secret_key,
+    }
+    # Use the SQL catalog with an in-memory SQLite database for storing metadata pointers
+    catalog = load_catalog(
+        "default",
+        **{
+            "type": "sql",
+            "uri": "sqlite:///:memory:",  # Use an in-memory SQL DB for the catalog
+            "warehouse": f"s3a://{bucket_name}/iceberg_warehouse",
+            **catalog_props,
+        }
+    )
+    table_identifier = ("default_db", "iceberg_table")
+    # Create a namespace (like a schema or database) for the table
+    try:
+        catalog.drop_namespace("default_db")
+    except Exception:
+        pass  # Ignore if namespace doesn't exist
+    catalog.create_namespace("default_db")
+    try:
+        catalog.load_table(table_identifier)
+        catalog.drop_table(table_identifier)
+    except:
+        pass
+
+    # Create the table schema and object first
+    schema = df.to_arrow().schema
+    table = catalog.create_table(identifier=table_identifier, schema=schema)
+
+    # Use the simplified write_iceberg method from Polars
+    df.write_iceberg(table, mode='overwrite')
+
+    # NOW CREATE WHAT SHOULD EXIST BY DEFAULT - SANE METADATA POINTERS
+    # Get the current metadata location from the table
+    current_metadata = table.metadata_location
+    logger.info(f"Original metadata location: {current_metadata}")
+
+    # Extract just the path part
+    if current_metadata.startswith("s3a://"):
+        current_metadata_key = current_metadata.replace(f"s3a://{bucket_name}/", "")
+    else:
+        current_metadata_key = current_metadata.replace(f"s3://{bucket_name}/", "")
+
+    # Read the current metadata
+    response = s3_client.get_object(Bucket=bucket_name, Key=current_metadata_key)
+    metadata_content = response['Body'].read()
+
+    # Get the metadata directory
+    metadata_dir = "/".join(current_metadata_key.split("/")[:-1])
+
+    # Write it to standardized locations
+    # 1. metadata.json in the metadata folder (this is what pl.scan_iceberg expects)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/metadata.json",
+        Body=metadata_content
+    )
+    logger.info(f"Created stable metadata.json at: s3://{bucket_name}/{metadata_dir}/metadata.json")
+
+    # 2. current.json as an additional pointer
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/current.json",
+        Body=metadata_content
+    )
+
+    # 3. VERSION file that contains the current metadata filename
+    current_metadata_filename = current_metadata_key.split("/")[-1]
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/VERSION",
+        Body=current_metadata_filename.encode()
+    )
+
+    # 4. version-hint.text (some Iceberg readers look for this)
+    s3_client.put_object(
+        Bucket=bucket_name,
+        Key=f"{metadata_dir}/version-hint.text",
+        Body=current_metadata_filename.encode()
+    )
+
+    table_base = "iceberg_warehouse/default_db.db/my_iceberg_table"
+    logger.info(f"""
+    ✅ Iceberg table created with SANE access patterns:
+       - Versioned metadata: s3://{bucket_name}/{current_metadata_key}
+       - Latest metadata: s3://{bucket_name}/{table_base}/metadata/metadata.json
+       - Current pointer: s3://{bucket_name}/{table_base}/metadata/current.json
+       - Version hint: s3://{bucket_name}/{table_base}/metadata/version-hint.text
+
+    Read with: pl.scan_iceberg('s3://{bucket_name}/{table_base}/metadata/metadata.json').collect()
+    """)
+
+
+def populate_test_data(endpoint_url: str, access_key: str, secret_key: str, bucket_name: str):
+    """
+    Populates a MinIO bucket with a variety of large-scale test data formats.
+
+    Args:
+        endpoint_url (str): The S3 endpoint URL for the MinIO instance.
+        access_key (str): The access key for MinIO.
+        secret_key (str): The secret key for MinIO.
+        bucket_name (str): The name of the bucket to populate.
+    """
+    logger.info("🚀 Starting data population...")
+    # --- S3 Client and Storage Options ---
+    s3_client = boto3.client(
+        's3',
+        endpoint_url=endpoint_url,
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        config=Config(signature_version='s3v4'),
+        region_name='us-east-1'
+    )
+    storage_options = {
+        "AWS_ENDPOINT_URL": endpoint_url,
+        "AWS_ACCESS_KEY_ID": access_key,
+        "AWS_SECRET_ACCESS_KEY": secret_key,
+        "AWS_REGION": "us-east-1",
+        "AWS_ALLOW_HTTP": "true",
+        "AWS_S3_ALLOW_UNSAFE_RENAME": "true"
+    }
+
+    # --- Data Generation ---
+    data_size = 100_000
+    df = pl.DataFrame({
+        "id": range(1, data_size + 1),
+        "name": [f"user_{i}" for i in range(1, data_size + 1)],
+        "value": [i * 10.5 for i in range(1, data_size + 1)],
+        "category": ["A", "B", "C", "D", "E"] * (data_size // 5)
+    })
+    logger.info(f"Generated a Polars DataFrame with {data_size} rows.")
+    #
+    # # --- Execute Data Population Scenarios ---
+    _create_single_csv_file(s3_client, df, bucket_name)
+    _create_multi_file_csv(s3_client, df, bucket_name)
+    _create_single_file_json(s3_client, df, bucket_name)
+    _create_multi_file_json(s3_client, df, bucket_name)
+    _create_single_parquet_file(s3_client, df, bucket_name)
+    _create_multi_parquet_file(s3_client, df, bucket_name)
+
+    # Convert to PyArrow table once for Delta and Iceberg
+    arrow_table = df.to_arrow()
+
+    _create_delta_lake_table(arrow_table, bucket_name, storage_options)
+    _create_iceberg_table(df, bucket_name, endpoint_url, access_key, secret_key, s3_client)
+
+    logger.info("✅ All test data populated successfully.")
+
+
+if __name__ == '__main__':
+    populate_test_data(endpoint_url=MINIO_ENDPOINT_URL,
+                       access_key=MINIO_ACCESS_KEY,
+                       secret_key=MINIO_SECRET_KEY,
+                       bucket_name="test-bucket")
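Once populated, the bucket can be read back with Polars against the same MinIO endpoint. A minimal sketch, assuming the defaults above (test-bucket, minioadmin credentials, port 9000) and a Polars version whose storage_options forwards these object_store keys; the key names may need adjusting for other versions:

import polars as pl

# Matches the TEST_MINIO_* defaults used by the generator above.
storage_options = {
    "aws_access_key_id": "minioadmin",
    "aws_secret_access_key": "minioadmin",
    "aws_region": "us-east-1",
    "aws_endpoint_url": "http://localhost:9000",
    "aws_allow_http": "true",
}

# Single Parquet file written by _create_single_parquet_file.
single = pl.scan_parquet(
    "s3://test-bucket/single-file-parquet/data.parquet",
    storage_options=storage_options,
).collect()

# Partitioned Parquet written by _create_multi_parquet_file (glob over the parts).
multi = pl.scan_parquet(
    "s3://test-bucket/multi-file-parquet/*.parquet",
    storage_options=storage_options,
).collect()

print(single.shape, multi.shape)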