flowfile-0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile might be problematic.

Files changed (171)
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
flowfile_frame/utils.py
@@ -0,0 +1,184 @@
+ import uuid
+ import time
+ import os
+ import requests
+ import subprocess
+ from pathlib import Path
+ from typing import Iterable, Any, List, Optional
+ from flowfile_core.flowfile.FlowfileFlow import FlowGraph
+ from flowfile_core.schemas import schemas
+ from tempfile import TemporaryDirectory
+
+
+ def _is_iterable(obj: Any) -> bool:
+     # Avoid treating strings as iterables in this context
+     return isinstance(obj, Iterable) and not isinstance(obj, (str, bytes))
+
+
+ def _parse_inputs_as_iterable(
+     inputs: tuple[Any, ...] | tuple[Iterable[Any]],
+ ) -> List[Any]:
+     if not inputs:
+         return []
+
+     # Treat elements of a single iterable as separate inputs
+     if len(inputs) == 1 and _is_iterable(inputs[0]):
+         return list(inputs[0])
+
+     return list(inputs)
+
+
+ def _generate_id() -> int:
+     """Generate a simple unique ID for nodes."""
+     return int(uuid.uuid4().int % 100000)
+
+
+ def create_etl_graph() -> FlowGraph:
+     flow_id = _generate_id()
+     flow_settings = schemas.FlowSettings(
+         flow_id=flow_id,
+         name=f"Flow_{flow_id}",
+         path=f"flow_{flow_id}"
+     )
+     flow_graph = FlowGraph(flow_id=flow_id, flow_settings=flow_settings)
+     # Always create a local frame so that the runtime does not attempt to use the flowfile_worker process
+     flow_graph.flow_settings.execution_location = 'local'
+     return flow_graph
+
+
+ def is_flowfile_running() -> bool:
+     """Check if the Flowfile application is running by testing its API endpoint."""
+     try:
+         response = requests.get("http://0.0.0.0:63578/docs", timeout=2)
+         return response.status_code == 200
+     except (requests.ConnectionError, requests.Timeout):
+         return False
+
+
+ def start_flowfile_application() -> bool:
+     """Start the Flowfile application on macOS."""
+     try:
+         # Attempt to start the Flowfile application
+         subprocess.Popen(['open', '-a', 'Flowfile'],
+                          stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE)
+
+         # Wait for the application to start up (max 10 seconds)
+         start_time = time.time()
+         while time.time() - start_time < 10:
+             if is_flowfile_running():
+                 return True
+             time.sleep(0.5)  # Check every half second
+
+         # If we get here, the app didn't start in time
+         return False
+     except Exception as e:
+         print(f"Error starting Flowfile application: {e}")
+         return False
+
+
+ def get_auth_token() -> Optional[str]:
+     """Get an authentication token from the Flowfile API."""
+     try:
+         response = requests.post(
+             "http://0.0.0.0:63578/auth/token",
+             json={},  # Empty body as specified
+             timeout=5
+         )
+
+         if response.status_code == 200:
+             token_data = response.json()
+             return token_data.get("access_token")
+         else:
+             print(f"Failed to get auth token: {response.status_code} - {response.text}")
+             return None
+     except Exception as e:
+         print(f"Error getting auth token: {e}")
+         return None
+
+
+ def import_flow_to_editor(flow_path: str, auth_token: str) -> Optional[int]:
+     """Import the flow into the Flowfile editor using the API endpoint."""
+     try:
+         flow_path = Path(flow_path).resolve()  # Get absolute path
+         if not flow_path.exists():
+             print(f"Flow file not found: {flow_path}")
+             return None
+
+         # Set authorization header with the token
+         headers = {"Authorization": f"Bearer {auth_token}"}
+
+         # Make a GET request to the import endpoint
+         response = requests.get(
+             "http://0.0.0.0:63578/import_flow/",
+             params={"flow_path": str(flow_path)},
+             headers=headers,
+             timeout=10
+         )
+
+         if response.status_code == 200:
+             flow_id = response.json()
+             print(f"Flow imported successfully with ID: {flow_id}")
+             return flow_id
+         else:
+             print(f"Failed to import flow: {response.status_code} - {response.text}")
+             return None
+     except Exception as e:
+         print(f"Error importing flow: {e}")
+         return None
+
+
+ def open_graph_in_editor(etl_graph: FlowGraph, storage_location: Optional[str] = None) -> bool:
+     """
+     Save the ETL graph and open it in the Flowfile editor.
+
+     Parameters
+     ----------
+     etl_graph : FlowGraph
+         The graph to save and open
+     storage_location : str, optional
+         Where to save the flowfile. If None, a default name is used.
+
+     Returns
+     -------
+     bool
+         True if the graph was successfully opened in the editor, False otherwise
+     """
+     # Create a temporary directory if needed
+     temp_dir = None
+     if storage_location is None:
+         temp_dir = TemporaryDirectory()
+         storage_location = os.path.join(temp_dir.name, 'temp_flow.flowfile')
+     else:
+         # Ensure path is absolute
+         storage_location = os.path.abspath(storage_location)
+
+     etl_graph.apply_layout()
+     etl_graph.save_flow(storage_location)
+     print(f"Flow saved to: {storage_location}")
+
+     # Check if Flowfile is running, and start it if not
+     if not is_flowfile_running():
+         print("Flowfile application is not running. Starting it...")
+         if not start_flowfile_application():
+             print("Failed to start Flowfile application")
+             if temp_dir:
+                 temp_dir.cleanup()
+             return False
+         print("Flowfile application started successfully")
+
+     # Get authentication token
+     auth_token = get_auth_token()
+     if not auth_token:
+         print("Failed to authenticate with Flowfile API")
+         if temp_dir:
+             temp_dir.cleanup()
+         return False
+
+     # Import the flow into the editor
+     flow_id = import_flow_to_editor(storage_location, auth_token)
+
+     # Clean up temporary directory if we created one
+     if temp_dir:
+         temp_dir.cleanup()
+
+     return flow_id is not None
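
For orientation, a minimal usage sketch of the helpers above (the import path is inferred from the file list; in practice nodes would be added through the FlowGraph API before opening):

from flowfile_frame.utils import create_etl_graph, open_graph_in_editor  # assumed module path

graph = create_etl_graph()        # local FlowGraph with a generated flow ID
# ... add nodes/edges via the FlowGraph API here ...
if open_graph_in_editor(graph):   # saves to a temp .flowfile, then imports it over the local API
    print("Graph is now open in the Flowfile editor")

Because open_graph_in_editor defaults to a TemporaryDirectory, the saved .flowfile is removed again once the import call returns.
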
flowfile_worker/__init__.py
@@ -0,0 +1,55 @@
+ from typing import Dict
+ import tempfile
+ import threading
+ import multiprocessing
+ import os
+ import shutil
+ multiprocessing.set_start_method('spawn', force=True)
+
+
+ from multiprocessing import get_context
+ from flowfile_worker.models import Status
+ mp_context = get_context("spawn")
+ status_dict: Dict[str, Status] = dict()
+ process_dict = dict()
+
+ status_dict_lock = threading.Lock()
+ process_dict_lock = threading.Lock()
+
+
+ class SharedTempDirectory:
+     """A class that mimics tempfile.TemporaryDirectory but uses a fixed directory"""
+     def __init__(self, dir_path):
+         self._path = dir_path
+         os.makedirs(self._path, exist_ok=True)
+
+     @property
+     def name(self):
+         return self._path
+
+     def cleanup(self):
+         """Remove all contents of the temp directory"""
+         try:
+             shutil.rmtree(self._path)
+             os.makedirs(self._path, exist_ok=True)
+             print(f"Cleaned up temporary directory: {self._path}")
+         except Exception as e:
+             print(f"Error during cleanup: {e}")
+
+     def __enter__(self):
+         return self.name
+
+     def __exit__(self, exc, value, tb):
+         self.cleanup()
+
+
+ CACHE_EXPIRATION_TIME = 24 * 60 * 60
+
+
+ TEMP_DIR = os.getenv('TEMP_DIR')
+ if TEMP_DIR:
+     CACHE_DIR = SharedTempDirectory(TEMP_DIR)
+ else:
+     CACHE_DIR = tempfile.TemporaryDirectory()
+
+ PROCESS_MEMORY_USAGE: Dict[str, float] = dict()
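
One design choice worth noting: unlike tempfile.TemporaryDirectory, cleanup() empties the fixed directory but leaves the path itself in place, so several worker processes can share one TEMP_DIR. A hedged sketch (the path is hypothetical):

import os
from flowfile_worker import SharedTempDirectory  # assumed import path

with SharedTempDirectory('/tmp/flowfile_cache') as cache_path:
    # created if missing, then used like a normal temp dir
    with open(os.path.join(cache_path, 'payload.bin'), 'wb') as fh:
        fh.write(b'cached bytes')
# on exit, cleanup() deletes the contents and recreates the empty directory
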
flowfile_worker/configs.py
@@ -0,0 +1,95 @@
+ # flowfile_worker.configs
+
+ import logging
+ import platform
+ import argparse
+ import os
+ from connectorx import __version__
+
+
+ # Configure logging
+ logging.basicConfig(format='%(asctime)s: %(message)s')
+ logger = logging.getLogger('FlowfileWorker')
+ logger.setLevel(logging.INFO)
+
+ # Constants for worker and core configuration
+ DEFAULT_SERVICE_HOST = "0.0.0.0" if platform.system() != "Windows" else "127.0.0.1"
+ DEFAULT_SERVICE_PORT = 63579
+ DEFAULT_CORE_HOST = "0.0.0.0" if platform.system() != "Windows" else "127.0.0.1"
+ DEFAULT_CORE_PORT = 63578
+ TEST_MODE = True if 'TEST_MODE' in os.environ else False
+
+
+ def parse_args():
+     """Parse command line arguments"""
+     parser = argparse.ArgumentParser(description="Flowfile Worker Server")
+     parser.add_argument(
+         "--host", type=str, default=DEFAULT_SERVICE_HOST, help="Host to bind worker to"
+     )
+     parser.add_argument(
+         "--port", type=int, default=DEFAULT_SERVICE_PORT, help="Port to bind worker to"
+     )
+     parser.add_argument(
+         "--core-host",
+         type=str,
+         default=DEFAULT_CORE_HOST,
+         help="Host of the core service",
+     )
+     parser.add_argument(
+         "--core-port",
+         type=int,
+         default=DEFAULT_CORE_PORT,
+         help="Port of the core service",
+     )
+
+     # Use known_args to handle PyInstaller's extra args
+     args = parser.parse_known_args()[0]
+
+     # Validate arguments
+     if args.port < 1 or args.port > 65535:
+         raise ValueError(
+             f"Invalid port number: {args.port}. Port must be between 1 and 65535."
+         )
+
+     if args.core_port < 1 or args.core_port > 65535:
+         raise ValueError(
+             f"Invalid core port number: {args.core_port}. Port must be between 1 and 65535."
+         )
+
+     # Check if hosts are valid (basic check)
+     if not args.host:
+         raise ValueError("Worker host cannot be empty")
+
+     if not args.core_host:
+         raise ValueError("Core host cannot be empty")
+
+     return args
+
+
+ def get_core_url(host, port):
+     """
+     Get the core URL based on provided host and port
+
+     Args:
+         host: Core service host
+         port: Core service port
+     """
+     return f"http://{host}:{port}"
+
+
+ # Parse arguments - defaults are already set in the argument parser
+ args = parse_args()
+
+ # These variables will already use defaults from argparse if not provided
+ SERVICE_HOST = args.host
+ SERVICE_PORT = args.port
+ CORE_HOST = args.core_host
+ CORE_PORT = args.core_port
+
+ # Generate the core URI
+ FLOWFILE_CORE_URI = get_core_url(CORE_HOST, CORE_PORT)
+
+ logger.info(f"ConnectorX version: {__version__}")
+ # Log configuration
+ logger.info(f"Worker configured at {SERVICE_HOST}:{SERVICE_PORT}")
+ logger.info(f"Core service configured at {FLOWFILE_CORE_URI}")
flowfile_worker/create/__init__.py
@@ -0,0 +1,37 @@
+ from flowfile_worker.create.models import (ReceivedCsvTable, ReceivedParquetTable, ReceivedExcelTable,
+                                            ReceivedJsonTable)
+ from flowfile_worker.create.funcs import (create_from_path_csv, create_from_path_parquet, create_from_path_excel,
+                                           create_from_path_json)
+ from typing import Dict, Literal
+
+ ReceivedTableCollection = ReceivedCsvTable | ReceivedParquetTable | ReceivedJsonTable | ReceivedExcelTable
+ FileType = Literal['csv', 'parquet', 'json', 'excel']
+
+
+ def received_table_parser(received_table_raw: Dict, file_type: FileType) -> ReceivedTableCollection:
+     match file_type:
+         case 'csv':
+             received_table = ReceivedCsvTable.model_validate(received_table_raw)
+         case 'parquet':
+             received_table = ReceivedParquetTable.model_validate(received_table_raw)
+         case 'excel':
+             received_table = ReceivedExcelTable.model_validate(received_table_raw)
+         case 'json':
+             received_table = ReceivedJsonTable.model_validate(received_table_raw)
+         case _:
+             raise ValueError(f'Unsupported file type: {file_type}')
+     return received_table
+
+
+ def table_creator_factory_method(file_type: FileType) -> callable:
+     match file_type:
+         case 'csv':
+             return create_from_path_csv
+         case 'parquet':
+             return create_from_path_parquet
+         case 'excel':
+             return create_from_path_excel
+         case 'json':
+             return create_from_path_json
+         case _:
+             raise ValueError(f'Unsupported file type: {file_type}')
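
A short example of how the two helpers compose for a CSV payload (field values are made up; the full schema is in the models file shown further down):

raw = {'name': 'orders.csv', 'path': '/data/orders.csv', 'delimiter': ';'}

table = received_table_parser(raw, 'csv')      # validated ReceivedCsvTable
reader = table_creator_factory_method('csv')   # create_from_path_csv
df = reader(table)                             # polars (Lazy)Frame for the file
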
flowfile_worker/create/funcs.py
@@ -0,0 +1,146 @@
+ import polars as pl
+ import os
+
+ from flowfile_worker.create.models import ReceivedCsvTable, ReceivedParquetTable, ReceivedExcelTable
+ from flowfile_worker.create.utils import create_fake_data
+ from flowfile_worker.create.read_excel_tables import df_from_openpyxl, df_from_calamine_xlsx
+
+
+ def create_from_path_json(received_table: ReceivedCsvTable):
+     # Note: JSON input currently reuses the CSV reading logic below
+     # (ReceivedJsonTable subclasses ReceivedCsvTable).
+     f = received_table.abs_file_path
+     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000  # approximate size in GB
+     low_mem = gbs_to_load > 10
+     if received_table.encoding.upper() in ('UTF8', 'UTF-8'):
+         try:
+             df = pl.scan_csv(f,
+                              low_memory=low_mem,
+                              try_parse_dates=True,
+                              separator=received_table.delimiter,
+                              has_header=received_table.has_headers,
+                              skip_rows=received_table.starting_from_line,
+                              encoding='utf8',
+                              infer_schema_length=received_table.infer_schema_length)
+             df.head(1).collect()  # force a tiny eager read to validate the scan
+             return df
+         except Exception:
+             try:
+                 df = pl.scan_csv(f, low_memory=low_mem,
+                                  separator=received_table.delimiter,
+                                  has_header=received_table.has_headers,
+                                  skip_rows=received_table.starting_from_line,
+                                  encoding='utf8-lossy',
+                                  ignore_errors=True)
+                 return df
+             except Exception:
+                 df = pl.scan_csv(f, low_memory=low_mem,
+                                  separator=received_table.delimiter,
+                                  has_header=received_table.has_headers,
+                                  skip_rows=received_table.starting_from_line,
+                                  encoding='utf8',
+                                  ignore_errors=True)
+                 return df
+     else:
+         df = pl.read_csv(f, low_memory=low_mem,
+                          separator=received_table.delimiter,
+                          has_header=received_table.has_headers,
+                          skip_rows=received_table.starting_from_line,
+                          encoding=received_table.encoding,
+                          ignore_errors=True)
+         return df
+
+
+ def create_from_path_csv(received_table: ReceivedCsvTable) -> pl.DataFrame:
+     f = received_table.abs_file_path
+     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000  # approximate size in GB
+     low_mem = gbs_to_load > 10
+     if received_table.encoding.upper() in ('UTF8', 'UTF-8'):
+         try:
+             df = pl.scan_csv(f,
+                              low_memory=low_mem,
+                              try_parse_dates=True,
+                              separator=received_table.delimiter,
+                              has_header=received_table.has_headers,
+                              skip_rows=received_table.starting_from_line,
+                              encoding='utf8',
+                              infer_schema_length=received_table.infer_schema_length)
+             df.head(1).collect()  # force a tiny eager read to validate the scan
+             return df
+         except Exception:
+             try:
+                 df = pl.scan_csv(f, low_memory=low_mem,
+                                  separator=received_table.delimiter,
+                                  has_header=received_table.has_headers,
+                                  skip_rows=received_table.starting_from_line,
+                                  encoding='utf8-lossy',
+                                  ignore_errors=True)
+                 return df
+             except Exception:
+                 df = pl.scan_csv(f, low_memory=low_mem,
+                                  separator=received_table.delimiter,
+                                  has_header=received_table.has_headers,
+                                  skip_rows=received_table.starting_from_line,
+                                  encoding='utf8',
+                                  ignore_errors=True)
+                 return df
+     else:
+         df = pl.read_csv(f,
+                          low_memory=low_mem,
+                          separator=received_table.delimiter,
+                          has_header=received_table.has_headers,
+                          skip_rows=received_table.starting_from_line,
+                          encoding=received_table.encoding,
+                          ignore_errors=True)
+         return df
+
+
+ def create_random(number_of_records: int = 1000) -> pl.LazyFrame:
+     return create_fake_data(number_of_records).lazy()
+
+
+ def create_from_path_parquet(received_table: ReceivedParquetTable):
+     low_mem = (os.path.getsize(received_table.abs_file_path) / 1024 / 1000 / 1000) > 2
+     return pl.scan_parquet(source=received_table.abs_file_path, low_memory=low_mem)
+
+
+ def create_from_path_excel(received_table: ReceivedExcelTable):
+     # Choose the Excel engine based on the requested read options
+     if received_table.type_inference:
+         engine = 'openpyxl'
+     elif received_table.start_row > 0 and received_table.start_column == 0:
+         engine = 'calamine' if received_table.has_headers else 'xlsx2csv'
+     elif received_table.start_column > 0 or received_table.start_row > 0:
+         engine = 'openpyxl'
+     else:
+         engine = 'calamine'
+
+     sheet_name = received_table.sheet_name
+
+     if engine == 'calamine':
+         df = df_from_calamine_xlsx(file_path=received_table.abs_file_path, sheet_name=sheet_name,
+                                    start_row=received_table.start_row, end_row=received_table.end_row)
+         if received_table.end_column > 0:
+             end_col_index = received_table.end_column
+             cols_to_select = [df.columns[i] for i in range(received_table.start_column, end_col_index)]
+             df = df.select(cols_to_select)
+
+     elif engine == 'xlsx2csv':
+         csv_options = {'has_header': received_table.has_headers, 'skip_rows': received_table.start_row}
+         df = pl.read_excel(source=received_table.abs_file_path,
+                            read_options=csv_options,
+                            engine='xlsx2csv',
+                            sheet_name=received_table.sheet_name)
+         end_col_index = received_table.end_column if received_table.end_column > 0 else len(df.columns)
+         cols_to_select = [df.columns[i] for i in range(received_table.start_column, end_col_index)]
+         df = df.select(cols_to_select)
+         if 0 < received_table.end_row < len(df):
+             df = df.head(received_table.end_row)
+
+     else:
+         max_col = received_table.end_column if received_table.end_column > 0 else None
+         max_row = received_table.end_row + 1 if received_table.end_row > 0 else None
+         df = df_from_openpyxl(file_path=received_table.abs_file_path,
+                               sheet_name=received_table.sheet_name,
+                               min_row=received_table.start_row + 1,
+                               min_col=received_table.start_column + 1,
+                               max_row=max_row,
+                               max_col=max_col, has_headers=received_table.has_headers)
+     return df
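
One detail worth calling out: the size check divides the byte count by 1024 * 1000 * 1000, a mixed binary/decimal approximation of gigabytes, so the CSV readers switch to low_memory above roughly 10 GB and the parquet scan above roughly 2 GB:

size_bytes = 12 * 1024 * 1000 * 1000            # a hypothetical ~12 GB file
gbs_to_load = size_bytes / 1024 / 1000 / 1000   # -> 12.0
assert gbs_to_load > 10                         # -> low_memory=True for CSV
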
flowfile_worker/create/models.py
@@ -0,0 +1,86 @@
+ from pydantic import BaseModel, Field, model_validator
+ from typing import List, Optional
+ import os
+ from pathlib import Path
+
+
+ class MinimalFieldInfo(BaseModel):
+     name: str
+     data_type: str
+
+
+ class ReceivedTableBase(BaseModel):
+     id: Optional[int] = None
+     name: str
+     path: str
+     directory: Optional[str] = None
+     analysis_file_available: Optional[bool] = False
+     status: Optional[str] = None
+     file_type: Optional[str] = None
+     fields: List[MinimalFieldInfo] = Field(default_factory=list)
+     abs_file_path: Optional[str] = None
+
+     @classmethod
+     def create_from_path(cls, path: str):
+         filename = os.path.basename(path)
+         return cls(name=filename, path=path)
+
+     @property
+     def file_path(self) -> str:
+         if self.name not in self.path:
+             return os.path.join(self.path, self.name)
+         return self.path
+
+     @model_validator(mode="after")
+     def set_abs_file_path(cls, values):
+         abs_file_path = getattr(values, "abs_file_path", None)
+         if abs_file_path is None:
+             path = getattr(values, "path", None)
+             if not path:
+                 raise ValueError("Field 'path' is required to compute abs_file_path")
+             setattr(values, "abs_file_path", str(Path(path).absolute()))
+         return values
+
+
+ class ReceivedCsvTable(ReceivedTableBase):
+     file_type: Optional[str] = 'csv'
+     reference: Optional[str] = ''
+     starting_from_line: Optional[int] = 0
+     delimiter: Optional[str] = ','
+     has_headers: Optional[bool] = True
+     encoding: Optional[str] = 'utf-8'
+     parquet_ref: Optional[str] = None
+     row_delimiter: Optional[str] = '\n'
+     quote_char: Optional[str] = '"'
+     infer_schema_length: Optional[int] = 10_000
+     truncate_ragged_lines: Optional[bool] = False
+     ignore_errors: Optional[bool] = False
+
+
+ class ReceivedJsonTable(ReceivedCsvTable):
+     pass
+
+
+ class ReceivedParquetTable(ReceivedTableBase):
+     file_type: Optional[str] = 'parquet'
+
+
+ class ReceivedExcelTable(ReceivedTableBase):
+     sheet_name: Optional[str] = None
+     start_row: Optional[int] = 0
+     start_column: Optional[int] = 0
+     end_row: Optional[int] = 0
+     end_column: Optional[int] = 0
+     has_headers: Optional[bool] = True
+     type_inference: Optional[bool] = False
+
+     def validate_range_values(self):
+         # Validate that start and end rows/columns are non-negative integers
+         for attribute in [self.start_row, self.start_column, self.end_row, self.end_column]:
+             if not isinstance(attribute, int) or attribute < 0:
+                 raise ValueError("Row and column indices must be non-negative integers")
+
+         # Validate that start is before end if end is specified (non-zero)
+         if (0 < self.end_row < self.start_row) or \
+            (0 < self.end_column < self.start_column):
+             raise ValueError("Start row/column must not be greater than end row/column if specified")
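
Note that validate_range_values is a plain method rather than a pydantic validator, so nothing runs unless a caller invokes it explicitly. A sketch with made-up values:

tbl = ReceivedExcelTable(name='report.xlsx', path='/data/report.xlsx',
                         start_row=5, end_row=2)   # constructs without error
tbl.validate_range_values()                        # raises ValueError: start after end
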
flowfile_worker/create/pl_types.py
@@ -0,0 +1,35 @@
+ import polars as pl
+
+
+ dtype_to_pl = {
+     'int': pl.Int64,
+     'integer': pl.Int64,
+     'char': pl.String,
+     'fixed decimal': pl.Float32,
+     'double': pl.Float64,
+     'float': pl.Float64,
+     'bool': pl.Boolean,
+     'byte': pl.UInt8,
+     'bit': pl.Binary,
+     'date': pl.Date,
+     'datetime': pl.Datetime,
+     'string': pl.String,
+     'str': pl.String,
+     'time': pl.Time,
+ }
+
+ dtype_to_pl_str = {k: v.__name__ for k, v in dtype_to_pl.items()}
+
+
+ def type_to_polars(dtype: str):
+     pl_datetype = dtype_to_pl.get(dtype.lower())
+     if pl_datetype is not None:
+         return pl_datetype
+     elif hasattr(pl, dtype):
+         return getattr(pl, dtype)
+     else:
+         return pl.String
+
+
+ def type_to_polars_str(dtype: str) -> pl.DataType:
+     return type_to_polars(dtype)()
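
To make the fallback order concrete (exact lowercase match in dtype_to_pl first, then any polars attribute with the original casing, then pl.String), here are a few checks that should hold on recent polars versions:

import polars as pl

assert type_to_polars('double') is pl.Float64    # mapped via dtype_to_pl
assert type_to_polars('Int32') is pl.Int32       # falls through to getattr(pl, 'Int32')
assert type_to_polars('mystery') is pl.String    # default for unknown names
assert isinstance(type_to_polars_str('date'), pl.Date)   # instantiated dtype
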