flowfile-0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
flowfile_worker/secrets.py
ADDED
@@ -0,0 +1,148 @@
+"""
+Simplified secure storage module for FlowFile worker to read credentials and secrets.
+"""
+from cryptography.fernet import Fernet
+import os
+from pathlib import Path
+import json
+import logging
+from pydantic import SecretStr
+from flowfile_worker.configs import TEST_MODE
+
+# Set up logging
+logger = logging.getLogger(__name__)
+
+
+class SecureStorage:
+    """A secure local storage mechanism for reading secrets using Fernet encryption."""
+
+    def __init__(self):
+        app_data = os.environ.get("APPDATA") or os.path.expanduser("~/.config")
+        self.storage_path = Path(app_data) / "flowfile"
+        logger.debug(f"Using storage path: {self.storage_path}")
+        self.key_path = self.storage_path / ".secret_key"
+
+    def _get_store_path(self, service_name):
+        """Get the path to the encrypted store file for a service."""
+        return self.storage_path / f"{service_name}.json.enc"
+
+    def _read_store(self, service_name):
+        """Read and decrypt the store file for a service."""
+        path = self._get_store_path(service_name)
+        if not path.exists():
+            return {}
+
+        try:
+            with open(self.key_path, "rb") as f:
+                key = f.read()
+            with open(path, "rb") as f:
+                data = f.read()
+
+            return json.loads(Fernet(key).decrypt(data).decode())
+        except Exception as e:
+            logger.debug(f"Error reading from encrypted store: {e}")
+            return {}
+
+    def get_password(self, service_name, username):
+        """Retrieve a password from secure storage."""
+        store = self._read_store(service_name)
+        return store.get(username)
+
+
+_storage = SecureStorage()
+
+
+def get_password(service_name, username):
+    """
+    Retrieve a password from secure storage.
+
+    Args:
+        service_name: The name of the service
+        username: The username or key
+
+    Returns:
+        The stored password or None if not found
+    """
+    return _storage.get_password(service_name, username)
+
+
+def get_docker_secret_key():
+    """
+    Get the master key from Docker secret.
+
+    Returns:
+        str: The master key if successfully read from Docker secret.
+
+    Raises:
+        RuntimeError: If running in Docker but unable to access the secret.
+    """
+    secret_path = "/run/secrets/flowfile_master_key"
+    if os.path.exists(secret_path):
+        try:
+            with open(secret_path, "r") as f:
+                return f.read().strip()
+        except Exception as e:
+            logger.error(f"Failed to read master key from Docker secret: {e}")
+            raise RuntimeError("Failed to read master key from Docker secret")
+    else:
+        logger.critical("Running in Docker but flowfile_master_key secret is not mounted!")
+        raise RuntimeError("Docker secret 'flowfile_master_key' is not mounted")
+
+
+def get_master_key() -> str:
+    """
+    Get the master encryption key.
+
+    If in TEST_MODE, returns a test key.
+    If running in Docker, retrieves the key from Docker secrets.
+    Otherwise, retrieves the key from secure storage.
+
+    Returns:
+        str: The master encryption key
+
+    Raises:
+        ValueError: If the master key is not found in storage.
+    """
+    # First check for test mode
+    if TEST_MODE:
+        return b'06t640eu3AG2FmglZS0n0zrEdqadoT7lYDwgSmKyxE4='.decode()
+
+    # Next check if running in Docker
+    if os.environ.get("RUNNING_IN_DOCKER") == "true":
+        return get_docker_secret_key()
+
+    # Otherwise read from local storage
+    key = get_password("flowfile", "master_key")
+    if not key:
+        raise ValueError("Master key not found in storage.")
+    return key
+
+
+def decrypt_secret(encrypted_value) -> SecretStr:
+    """
+    Decrypt an encrypted value using the master key.
+
+    Args:
+        encrypted_value: The encrypted value as a string
+
+    Returns:
+        SecretStr: The decrypted value as a SecretStr
+    """
+    key = get_master_key().encode()
+    f = Fernet(key)
+    return SecretStr(f.decrypt(encrypted_value.encode()).decode())
+
+
+def encrypt_secret(secret_value):
+    """
+    Encrypt a secret value using the master key.
+
+    Args:
+        secret_value: The secret value to encrypt
+
+    Returns:
+        str: The encrypted value as a string
+    """
+    key = get_master_key().encode()
+    f = Fernet(key)
+    return f.encrypt(secret_value.encode()).decode()
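
A minimal round-trip sketch of how the helpers above compose, assuming TEST_MODE is enabled so that get_master_key() resolves to the built-in test key (illustrative only, not part of the package):

from flowfile_worker.secrets import encrypt_secret, decrypt_secret

token = encrypt_secret("db-password")    # Fernet token, plain str
secret = decrypt_secret(token)           # pydantic SecretStr
print(secret)                            # prints ********** (masked repr)
print(secret.get_secret_value())         # prints db-password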
flowfile_worker/spawner.py
ADDED
@@ -0,0 +1,187 @@
+from flowfile_worker import status_dict
+from time import sleep
+import gc
+from typing import List, Tuple
+from multiprocessing import Process, Queue
+from flowfile_worker.process_manager import ProcessManager
+from flowfile_worker import models, mp_context, funcs, status_dict_lock
+
+# Initialize ProcessManager
+process_manager = ProcessManager()
+
+flowfile_node_id_type = int | str
+
+
+def handle_task(task_id: str, p: Process, progress: mp_context.Value, error_message: mp_context.Array, q: Queue):
+    """
+    Monitors and manages a running process task, updating its status and handling completion/errors.
+
+    Args:
+        task_id (str): Unique identifier for the task
+        p (Process): The multiprocessing Process object being monitored
+        progress (mp_context.Value): Shared value object tracking task progress (0-100)
+        error_message (mp_context.Array): Shared array for storing error messages
+        q (Queue): Queue for storing task results
+
+    Notes:
+        - Updates task status in status_dict while process is running
+        - Handles task cancellation, completion, and error states
+        - Cleans up process resources after completion
+    """
+    try:
+        with status_dict_lock:
+            status_dict[task_id].status = "Processing"
+
+        while p.is_alive():
+            sleep(1)
+            with progress.get_lock():
+                current_progress = progress.value
+            with status_dict_lock:
+                status_dict[task_id].progress = current_progress
+
+                # Check if the task has been cancelled via status_dict
+                if status_dict[task_id].status == "Cancelled":
+                    p.terminate()
+                    break
+
+            if current_progress == -1:
+                with status_dict_lock:
+                    status_dict[task_id].status = "Error"
+                    with error_message.get_lock():
+                        status_dict[task_id].error_message = error_message.value.decode().rstrip('\x00')
+                break
+
+        p.join()
+
+        with status_dict_lock:
+            status = status_dict[task_id]
+            if status.status != "Cancelled":
+                if progress.value == 100:
+                    status.status = "Completed"
+                    if not q.empty():
+                        status.results = q.get()
+                elif progress.value != -1:
+                    status_dict[task_id].status = "Unknown Error"
+
+    finally:
+        if p.is_alive():
+            p.terminate()
+            p.join()
+        process_manager.remove_process(task_id)  # Remove from process manager
+        del p, progress, error_message
+        gc.collect()
+
+
+def start_process(polars_serializable_object: bytes, task_id: str,
+                  operation: models.OperationType,
+                  file_ref: str, flowfile_flow_id: int,
+                  flowfile_node_id: flowfile_node_id_type,
+                  kwargs: dict = None) -> None:
+    """
+    Starts a new process for handling Polars dataframe operations.
+
+    Args:
+        polars_serializable_object (bytes): Serialized Polars dataframe
+        task_id (str): Unique identifier for the task
+        operation (models.OperationType): Type of operation to perform
+        file_ref (str): Reference to the file being processed
+        kwargs (dict, optional): Additional arguments for the operation. Defaults to {}
+        flowfile_flow_id: id of the flow that started the process
+        flowfile_node_id: id of the node that started the process
+
+    Notes:
+        - Creates shared memory objects for progress tracking and error handling
+        - Initializes and starts a new process for the specified operation
+        - Delegates to handle_task for process monitoring
+    """
+    if kwargs is None:
+        kwargs = {}
+    process_task = getattr(funcs, operation)
+    kwargs['polars_serializable_object'] = polars_serializable_object
+    kwargs['progress'] = mp_context.Value('i', 0)
+    kwargs['error_message'] = mp_context.Array('c', 1024)
+    kwargs['queue'] = Queue(maxsize=1)
+    kwargs['file_path'] = file_ref
+    kwargs['flowfile_flow_id'] = flowfile_flow_id
+    kwargs['flowfile_node_id'] = flowfile_node_id
+
+    p: Process = mp_context.Process(target=process_task, kwargs=kwargs)
+    p.start()
+
+    process_manager.add_process(task_id, p)
+    handle_task(task_id=task_id, p=p, progress=kwargs['progress'], error_message=kwargs['error_message'], q=kwargs['queue'])
+
+
+def start_generic_process(func_ref: callable, task_id: str,
+                          file_ref: str, flowfile_flow_id: int,
+                          flowfile_node_id: flowfile_node_id_type, kwargs: dict = None) -> None:
+    """
+    Starts a new process for handling generic function execution.
+
+    Args:
+        func_ref (callable): Reference to the function to be executed
+        task_id (str): Unique identifier for the task
+        file_ref (str): Reference to the file being processed
+        flowfile_flow_id: id of the flow that started the process
+        flowfile_node_id: id of the node that started the process
+        kwargs (dict, optional): Additional arguments for the function. Defaults to None.
+
+    Notes:
+        - Creates shared memory objects for progress tracking and error handling
+        - Initializes and starts a new process for the generic function
+        - Delegates to handle_task for process monitoring
+    """
+    kwargs = {} if kwargs is None else kwargs
+    kwargs['func'] = func_ref
+    kwargs['progress'] = mp_context.Value('i', 0)
+    kwargs['error_message'] = mp_context.Array('c', 1024)
+    kwargs['queue'] = Queue(maxsize=1)
+    kwargs['file_path'] = file_ref
+    kwargs['flowfile_flow_id'] = flowfile_flow_id
+    kwargs['flowfile_node_id'] = flowfile_node_id
+
+    process_task = getattr(funcs, 'generic_task')
+    p: Process = mp_context.Process(target=process_task, kwargs=kwargs)
+    p.start()
+
+    process_manager.add_process(task_id, p)  # Add process to process manager
+    handle_task(task_id=task_id, p=p, progress=kwargs['progress'],
+                error_message=kwargs['error_message'], q=kwargs['queue'])
+
+
+def start_fuzzy_process(left_serializable_object: bytes,
+                        right_serializable_object: bytes,
+                        file_ref: str,
+                        fuzzy_maps: List[models.FuzzyMapping],
+                        task_id: str,
+                        flowfile_flow_id: int,
+                        flowfile_node_id: flowfile_node_id_type) -> None:
+    """
+    Starts a new process for performing fuzzy joining operations on two datasets.
+
+    Args:
+        left_serializable_object (bytes): Serialized left dataframe
+        right_serializable_object (bytes): Serialized right dataframe
+        file_ref (str): Reference to the file being processed
+        fuzzy_maps (List[models.FuzzyMapping]): List of fuzzy mapping configurations
+        task_id (str): Unique identifier for the task
+        flowfile_flow_id: id of the flow that started the process
+        flowfile_node_id: id of the node that started the process
+    Notes:
+        - Creates shared memory objects for progress tracking and error handling
+        - Initializes and starts a new process for fuzzy joining operation
+        - Delegates to handle_task for process monitoring
+    """
+    progress = mp_context.Value('i', 0)
+    error_message = mp_context.Array('c', 1024)
+    q = Queue(maxsize=1)
+
+    args: Tuple[bytes, bytes, List[models.FuzzyMapping], mp_context.Array, str, mp_context.Value, Queue, int, flowfile_node_id_type] = \
+        (left_serializable_object, right_serializable_object, fuzzy_maps, error_message, file_ref, progress, q,
+         flowfile_flow_id, flowfile_node_id)
+
+    p: Process = mp_context.Process(target=funcs.fuzzy_join_task, args=args)
+    p.start()
+
+    process_manager.add_process(task_id, p)  # Add process to process manager
+    handle_task(task_id=task_id, p=p, progress=progress, error_message=error_message, q=q)
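
handle_task() above implies a contract for the worker functions in flowfile_worker.funcs: report progress through the shared 'i' Value (100 means completed, -1 means error), write failure text into the shared 1024-byte 'c' Array, and put the result on the Queue. A hedged sketch of a task honoring that contract (do_work is a hypothetical placeholder; the real implementations live in funcs.py):

def do_work(payload, path):
    # Stand-in for a real Polars operation; returns something picklable.
    return len(payload), path


def example_task(polars_serializable_object, progress, error_message, queue,
                 file_path, flowfile_flow_id, flowfile_node_id):
    try:
        result = do_work(polars_serializable_object, file_path)
        queue.put(result)                 # read by handle_task on completion
        with progress.get_lock():
            progress.value = 100          # signals "Completed" to the monitor
    except Exception as e:
        with error_message.get_lock():
            error_message.value = str(e).encode()[:1023]  # fits the shared 'c' Array
        with progress.get_lock():
            progress.value = -1           # signals "Error" to the monitor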
flowfile_worker/utils.py
ADDED
@@ -0,0 +1,25 @@
+import polars as pl
+from polars.exceptions import PanicException
+from dataclasses import dataclass
+
+
+def collect_lazy_frame(lf: pl.LazyFrame) -> pl.DataFrame:
+    try:
+        return lf.collect(engine="streaming")
+    except PanicException:
+        return lf.collect(engine="auto")
+
+
+@dataclass
+class CollectStreamingInfo:
+    __slots__ = 'df', 'streaming_collect_available'
+    df: pl.DataFrame
+    streaming_collect_available: bool
+
+
+def collect_lazy_frame_and_get_streaming_info(lf: pl.LazyFrame) -> CollectStreamingInfo:
+    try:
+        df = lf.collect(engine="streaming")
+        return CollectStreamingInfo(df, True)
+    except PanicException:
+        return CollectStreamingInfo(lf.collect(), False)
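
Usage is straightforward; the helper only falls back when Polars' streaming engine panics on an unsupported plan. A small sketch (not part of the package):

import polars as pl
from flowfile_worker.utils import collect_lazy_frame

lf = pl.LazyFrame({"a": [1, 2, 3]}).with_columns(b=pl.col("a") * 2)
df = collect_lazy_frame(lf)   # DataFrame with columns "a" and "b"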
test_utils/postgres/__init__.py
ADDED
@@ -0,0 +1 @@
+"""PostgreSQL test utilities."""
test_utils/postgres/commands.py
ADDED
@@ -0,0 +1,109 @@
+"""
+Command-line interface for PostgreSQL test database management.
+
+This module provides command-line functions that can be called via Poetry scripts
+to start and stop PostgreSQL containers with sample data.
+"""
+
+import argparse
+import sys
+import logging
+from . import fixtures
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger("postgres_commands")
+
+
+def start_postgres():
+    """
+    Start PostgreSQL container with sample data.
+
+    This function is the entry point for the 'start_postgres' Poetry script.
+    It parses command-line arguments and starts a PostgreSQL container
+    with the requested configuration.
+
+    Returns:
+        Exit code (0 for success, 1 for failure)
+    """
+    # Check if Docker is available first
+    if not fixtures.is_docker_available():
+        logger.warning("Docker is not available. Cannot start PostgreSQL container.")
+        print("\n" + "=" * 50)
+        print("SKIPPING: Docker is not available on this system")
+        print("Tests requiring Docker will need to be skipped")
+        print("=" * 50 + "\n")
+        return 0  # Return success to allow pipeline to continue
+
+    parser = argparse.ArgumentParser(description="Start PostgreSQL container with sample data")
+    parser.add_argument("--schema", choices=["movies", "stocks"], default="movies",
+                        help="Sample schema to use (movies or stocks)")
+    parser.add_argument("--port", type=int, default=5433,
+                        help="Port to expose PostgreSQL on (default: 5433)")
+    parser.add_argument("--user", default="testuser",
+                        help="PostgreSQL username (default: testuser)")
+    parser.add_argument("--password", default="testpass",
+                        help="PostgreSQL password (default: testpass)")
+    parser.add_argument("--db", default="testdb",
+                        help="PostgreSQL database name (default: testdb)")
+    parser.add_argument("--container-name", default="test-postgres-sample",
+                        help="Docker container name (default: test-postgres-sample)")
+    parser.add_argument("--image-tag", default="test-sample-db",
+                        help="Docker image tag (default: test-sample-db)")
+
+    args = parser.parse_args()
+
+    # Setup and start
+    if fixtures.setup_postgres_samples(args.schema, args.port, args.user, args.password, args.db, args.image_tag):
+        if fixtures.start_postgres_container(args.container_name, args.port, args.image_tag)[1]:
+            fixtures.print_connection_info(
+                "localhost", args.port, args.db, args.user, args.password, args.container_name
+            )
+            return 0
+    return 1
+
+
+def stop_postgres():
+    """
+    Stop PostgreSQL container.
+
+    This function is the entry point for the 'stop_postgres' Poetry script.
+    It parses command-line arguments and stops the specified PostgreSQL container.
+
+    Returns:
+        Exit code (0 for success, 1 for failure)
+    """
+    # Check if Docker is available first
+    if not fixtures.is_docker_available():
+        logger.warning("Docker is not available. No PostgreSQL container to stop.")
+        return 0  # Return success to allow pipeline to continue
+
+    parser = argparse.ArgumentParser(description="Stop PostgreSQL container")
+    parser.add_argument("--container-name", default="test-postgres-sample",
+                        help="Docker container name (default: test-postgres-sample)")
+    parser.add_argument("--timeout", type=int, default=15,
+                        help="Timeout for graceful container stop in seconds (default: 15)")
+
+    args = parser.parse_args()
+
+    if fixtures.stop_postgres_container(args.container_name, args.timeout):
+        print(f"Container {args.container_name} stopped successfully")
+        return 0
+    else:
+        print(f"Failed to stop container {args.container_name}")
+        return 1
+
+
+if __name__ == "__main__":
+    # Allow direct script execution for testing
+    if len(sys.argv) > 1 and sys.argv[1] == "start":
+        sys.exit(start_postgres())
+    elif len(sys.argv) > 1 and sys.argv[1] == "stop":
+        sys.exit(stop_postgres())
+    else:
+        print("Usage: python -m test_utils.postgres.commands [start|stop] [options]")
+        sys.exit(1)
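
Given the argparse defaults above, a started container should be reachable at a conventional PostgreSQL URL; a small illustrative snippet (the authoritative output comes from fixtures.print_connection_info):

host, port, db = "localhost", 5433, "testdb"
user, password = "testuser", "testpass"
url = f"postgresql://{user}:{password}@{host}:{port}/{db}"
print(url)  # postgresql://testuser:testpass@localhost:5433/testdb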