Flowfile 0.2.2 (flowfile-0.2.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of Flowfile might be problematic.

Files changed (171)
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
flowfile_worker/secrets.py ADDED
@@ -0,0 +1,148 @@
+ """
+ Simplified secure storage module for the FlowFile worker to read credentials and secrets.
+ """
+ from cryptography.fernet import Fernet
+ import os
+ from pathlib import Path
+ import json
+ import logging
+ from pydantic import SecretStr
+ from flowfile_worker.configs import TEST_MODE
+
+ # Set up logging
+ logger = logging.getLogger(__name__)
+
+
+ class SecureStorage:
+     """A secure local storage mechanism for reading secrets using Fernet encryption."""
+
+     def __init__(self):
+         app_data = os.environ.get("APPDATA") or os.path.expanduser("~/.config")
+         self.storage_path = Path(app_data) / "flowfile"
+         logger.debug(f"Using storage path: {self.storage_path}")
+         self.key_path = self.storage_path / ".secret_key"
+
+     def _get_store_path(self, service_name):
+         """Get the path to the encrypted store file for a service."""
+         return self.storage_path / f"{service_name}.json.enc"
+
+     def _read_store(self, service_name):
+         """Read and decrypt the store file for a service."""
+         path = self._get_store_path(service_name)
+         if not path.exists():
+             return {}
+
+         try:
+             with open(self.key_path, "rb") as f:
+                 key = f.read()
+             with open(path, "rb") as f:
+                 data = f.read()
+
+             return json.loads(Fernet(key).decrypt(data).decode())
+         except Exception as e:
+             logger.debug(f"Error reading from encrypted store: {e}")
+             return {}
+
+     def get_password(self, service_name, username):
+         """Retrieve a password from secure storage."""
+         store = self._read_store(service_name)
+         return store.get(username)
+
+
+ _storage = SecureStorage()
+
+
+ def get_password(service_name, username):
+     """
+     Retrieve a password from secure storage.
+
+     Args:
+         service_name: The name of the service
+         username: The username or key
+
+     Returns:
+         The stored password, or None if not found
+     """
+     return _storage.get_password(service_name, username)
+
+
+ def get_docker_secret_key():
+     """
+     Get the master key from a Docker secret.
+
+     Returns:
+         str: The master key if successfully read from the Docker secret.
+
+     Raises:
+         RuntimeError: If running in Docker but unable to access the secret.
+     """
+     secret_path = "/run/secrets/flowfile_master_key"
+     if os.path.exists(secret_path):
+         try:
+             with open(secret_path, "r") as f:
+                 return f.read().strip()
+         except Exception as e:
+             logger.error(f"Failed to read master key from Docker secret: {e}")
+             raise RuntimeError("Failed to read master key from Docker secret")
+     else:
+         logger.critical("Running in Docker but flowfile_master_key secret is not mounted!")
+         raise RuntimeError("Docker secret 'flowfile_master_key' is not mounted")
+
+
+ def get_master_key() -> str:
+     """
+     Get the master encryption key.
+
+     If in TEST_MODE, returns a test key.
+     If running in Docker, retrieves the key from Docker secrets.
+     Otherwise, retrieves the key from secure storage.
+
+     Returns:
+         str: The master encryption key
+
+     Raises:
+         ValueError: If the master key is not found in storage.
+     """
+     # First check for test mode
+     if TEST_MODE:
+         return b'06t640eu3AG2FmglZS0n0zrEdqadoT7lYDwgSmKyxE4='.decode()
+
+     # Next check if running in Docker
+     if os.environ.get("RUNNING_IN_DOCKER") == "true":
+         return get_docker_secret_key()
+
+     # Otherwise read from local storage
+     key = get_password("flowfile", "master_key")
+     if not key:
+         raise ValueError("Master key not found in storage.")
+     return key
+
+
+ def decrypt_secret(encrypted_value) -> SecretStr:
+     """
+     Decrypt an encrypted value using the master key.
+
+     Args:
+         encrypted_value: The encrypted value as a string
+
+     Returns:
+         SecretStr: The decrypted value as a SecretStr
+     """
+     key = get_master_key().encode()
+     f = Fernet(key)
+     return SecretStr(f.decrypt(encrypted_value.encode()).decode())
+
+
+ def encrypt_secret(secret_value):
+     """
+     Encrypt a secret value using the master key.
+
+     Args:
+         secret_value: The secret value to encrypt
+
+     Returns:
+         str: The encrypted value as a string
+     """
+     key = get_master_key().encode()
+     f = Fernet(key)
+     return f.encrypt(secret_value.encode()).decode()
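
For orientation, a minimal round trip through this module looks like the sketch below. It assumes a master key is already available (TEST_MODE enabled, or a key present in secure storage or the Docker secret); it uses only the functions shown in the diff above.

    from flowfile_worker.secrets import encrypt_secret, decrypt_secret

    # Encrypt a value with the master key, then decrypt it back to a SecretStr.
    token = encrypt_secret("my-database-password")  # Fernet token as str
    secret = decrypt_secret(token)                  # pydantic SecretStr
    assert secret.get_secret_value() == "my-database-password"

Note that decrypt_secret returns a SecretStr, so the plaintext is not exposed by accident in logs or reprs; callers must opt in via get_secret_value().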
flowfile_worker/spawner.py ADDED
@@ -0,0 +1,187 @@
+ from flowfile_worker import status_dict
+ from time import sleep
+ import gc
+ from typing import List, Tuple
+ from multiprocessing import Process, Queue
+ from flowfile_worker.process_manager import ProcessManager
+ from flowfile_worker import models, mp_context, funcs, status_dict_lock
+
+ # Initialize ProcessManager
+ process_manager = ProcessManager()
+
+ flowfile_node_id_type = int | str
+
+
+ def handle_task(task_id: str, p: Process, progress: mp_context.Value, error_message: mp_context.Array, q: Queue):
+     """
+     Monitors and manages a running process task, updating its status and handling completion and errors.
+
+     Args:
+         task_id (str): Unique identifier for the task
+         p (Process): The multiprocessing Process object being monitored
+         progress (mp_context.Value): Shared value object tracking task progress (0-100)
+         error_message (mp_context.Array): Shared array for storing error messages
+         q (Queue): Queue for storing task results
+
+     Notes:
+         - Updates task status in status_dict while the process is running
+         - Handles task cancellation, completion, and error states
+         - Cleans up process resources after completion
+     """
+     try:
+         with status_dict_lock:
+             status_dict[task_id].status = "Processing"
+
+         while p.is_alive():
+             sleep(1)
+             with progress.get_lock():
+                 current_progress = progress.value
+             with status_dict_lock:
+                 status_dict[task_id].progress = current_progress
+
+             # Check if the task has been cancelled via status_dict
+             if status_dict[task_id].status == "Cancelled":
+                 p.terminate()
+                 break
+
+             if current_progress == -1:
+                 with status_dict_lock:
+                     status_dict[task_id].status = "Error"
+                     with error_message.get_lock():
+                         status_dict[task_id].error_message = error_message.value.decode().rstrip('\x00')
+                 break
+
+         p.join()
+
+         with status_dict_lock:
+             status = status_dict[task_id]
+             if status.status != "Cancelled":
+                 if progress.value == 100:
+                     status.status = "Completed"
+                     if not q.empty():
+                         status.results = q.get()
+                 elif progress.value != -1:
+                     status_dict[task_id].status = "Unknown Error"
+
+     finally:
+         if p.is_alive():
+             p.terminate()
+             p.join()
+         process_manager.remove_process(task_id)  # Remove from process manager
+         del p, progress, error_message
+         gc.collect()
+
+
+ def start_process(polars_serializable_object: bytes, task_id: str,
+                   operation: models.OperationType,
+                   file_ref: str, flowfile_flow_id: int,
+                   flowfile_node_id: flowfile_node_id_type,
+                   kwargs: dict = None) -> None:
+     """
+     Starts a new process for handling Polars dataframe operations.
+
+     Args:
+         polars_serializable_object (bytes): Serialized Polars dataframe
+         task_id (str): Unique identifier for the task
+         operation (models.OperationType): Type of operation to perform
+         file_ref (str): Reference to the file being processed
+         flowfile_flow_id: ID of the flow that started the process
+         flowfile_node_id: ID of the node that started the process
+         kwargs (dict, optional): Additional arguments for the operation. Defaults to None.
+
+     Notes:
+         - Creates shared memory objects for progress tracking and error handling
+         - Initializes and starts a new process for the specified operation
+         - Delegates to handle_task for process monitoring
+     """
+     if kwargs is None:
+         kwargs = {}
+     process_task = getattr(funcs, operation)
+     kwargs['polars_serializable_object'] = polars_serializable_object
+     kwargs['progress'] = mp_context.Value('i', 0)
+     kwargs['error_message'] = mp_context.Array('c', 1024)
+     kwargs['queue'] = Queue(maxsize=1)
+     kwargs['file_path'] = file_ref
+     kwargs['flowfile_flow_id'] = flowfile_flow_id
+     kwargs['flowfile_node_id'] = flowfile_node_id
+
+     p: Process = mp_context.Process(target=process_task, kwargs=kwargs)
+     p.start()
+
+     process_manager.add_process(task_id, p)
+     handle_task(task_id=task_id, p=p, progress=kwargs['progress'], error_message=kwargs['error_message'], q=kwargs['queue'])
+
+
+ def start_generic_process(func_ref: callable, task_id: str,
+                           file_ref: str, flowfile_flow_id: int,
+                           flowfile_node_id: flowfile_node_id_type, kwargs: dict = None) -> None:
+     """
+     Starts a new process for handling generic function execution.
+
+     Args:
+         func_ref (callable): Reference to the function to be executed
+         task_id (str): Unique identifier for the task
+         file_ref (str): Reference to the file being processed
+         flowfile_flow_id: ID of the flow that started the process
+         flowfile_node_id: ID of the node that started the process
+         kwargs (dict, optional): Additional arguments for the function. Defaults to None.
+
+     Notes:
+         - Creates shared memory objects for progress tracking and error handling
+         - Initializes and starts a new process for the generic function
+         - Delegates to handle_task for process monitoring
+     """
+     kwargs = {} if kwargs is None else kwargs
+     kwargs['func'] = func_ref
+     kwargs['progress'] = mp_context.Value('i', 0)
+     kwargs['error_message'] = mp_context.Array('c', 1024)
+     kwargs['queue'] = Queue(maxsize=1)
+     kwargs['file_path'] = file_ref
+     kwargs['flowfile_flow_id'] = flowfile_flow_id
+     kwargs['flowfile_node_id'] = flowfile_node_id
+
+     process_task = getattr(funcs, 'generic_task')
+     p: Process = mp_context.Process(target=process_task, kwargs=kwargs)
+     p.start()
+
+     process_manager.add_process(task_id, p)  # Add process to process manager
+     handle_task(task_id=task_id, p=p, progress=kwargs['progress'],
+                 error_message=kwargs['error_message'], q=kwargs['queue'])
+
+
+ def start_fuzzy_process(left_serializable_object: bytes,
+                         right_serializable_object: bytes,
+                         file_ref: str,
+                         fuzzy_maps: List[models.FuzzyMapping],
+                         task_id: str,
+                         flowfile_flow_id: int,
+                         flowfile_node_id: flowfile_node_id_type) -> None:
+     """
+     Starts a new process for performing fuzzy joining operations on two datasets.
+
+     Args:
+         left_serializable_object (bytes): Serialized left dataframe
+         right_serializable_object (bytes): Serialized right dataframe
+         file_ref (str): Reference to the file being processed
+         fuzzy_maps (List[models.FuzzyMapping]): List of fuzzy mapping configurations
+         task_id (str): Unique identifier for the task
+         flowfile_flow_id: ID of the flow that started the process
+         flowfile_node_id: ID of the node that started the process
+
+     Notes:
+         - Creates shared memory objects for progress tracking and error handling
+         - Initializes and starts a new process for the fuzzy joining operation
+         - Delegates to handle_task for process monitoring
+     """
+     progress = mp_context.Value('i', 0)
+     error_message = mp_context.Array('c', 1024)
+     q = Queue(maxsize=1)
+
+     args: Tuple[bytes, bytes, List[models.FuzzyMapping], mp_context.Array, str, mp_context.Value, Queue, int, flowfile_node_id_type] = \
+         (left_serializable_object, right_serializable_object, fuzzy_maps, error_message, file_ref, progress, q,
+          flowfile_flow_id, flowfile_node_id)
+
+     p: Process = mp_context.Process(target=funcs.fuzzy_join_task, args=args)
+     p.start()
+
+     process_manager.add_process(task_id, p)  # Add process to process manager
+     handle_task(task_id=task_id, p=p, progress=progress, error_message=error_message, q=q)
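
As a rough sketch of the calling convention (hypothetical values throughout: the exact OperationType literals are defined in flowfile_worker/models.py, and whether LazyFrame.serialize() matches the payload format the worker expects is an assumption, not something this diff confirms):

    import uuid
    import polars as pl
    from flowfile_worker import spawner

    # Hypothetical payload: a serialized Polars LazyFrame. In practice the
    # bytes are produced upstream by the core service.
    serialized = pl.LazyFrame({"a": [1, 2, 3]}).serialize()

    spawner.start_process(
        polars_serializable_object=serialized,
        task_id=str(uuid.uuid4()),
        operation="store",            # assumed OperationType literal naming a function in funcs
        file_ref="/tmp/results/output.arrow",
        flowfile_flow_id=1,
        flowfile_node_id=42,
    )

Because start_process calls handle_task synchronously, it blocks until the child process exits; callers presumably run it on a background thread and poll status_dict for progress.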
flowfile_worker/utils.py ADDED
@@ -0,0 +1,25 @@
+ import polars as pl
+ from polars.exceptions import PanicException
+ from dataclasses import dataclass
+
+
+ def collect_lazy_frame(lf: pl.LazyFrame) -> pl.DataFrame:
+     try:
+         return lf.collect(engine="streaming")
+     except PanicException:
+         return lf.collect(engine="auto")
+
+
+ @dataclass
+ class CollectStreamingInfo:
+     __slots__ = 'df', 'streaming_collect_available'
+     df: pl.DataFrame
+     streaming_collect_available: bool
+
+
+ def collect_lazy_frame_and_get_streaming_info(lf: pl.LazyFrame) -> CollectStreamingInfo:
+     try:
+         df = lf.collect(engine="streaming")
+         return CollectStreamingInfo(df, True)
+     except PanicException:
+         return CollectStreamingInfo(lf.collect(), False)
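
A quick usage sketch (assuming a Polars version recent enough that collect() accepts the engine keyword, which the module itself relies on):

    import polars as pl
    from flowfile_worker.utils import collect_lazy_frame, collect_lazy_frame_and_get_streaming_info

    lf = pl.LazyFrame({"a": [1, 2, 3]}).with_columns(b=pl.col("a") * 2)

    df = collect_lazy_frame(lf)  # tries the streaming engine, falls back if it panics
    info = collect_lazy_frame_and_get_streaming_info(lf)
    print(info.streaming_collect_available, info.df.shape)

The fallback only catches PanicException, so ordinary query errors still propagate; the CollectStreamingInfo flag records whether the streaming engine succeeded.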
test_utils/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """Test utilities for the Flowfile project."""
+
+ __version__ = "0.1.0"
test_utils/postgres/__init__.py ADDED
@@ -0,0 +1 @@
+ """PostgreSQL test utilities."""
test_utils/postgres/commands.py ADDED
@@ -0,0 +1,109 @@
+ """
+ Command-line interface for PostgreSQL test database management.
+
+ This module provides command-line functions that can be called via Poetry scripts
+ to start and stop PostgreSQL containers with sample data.
+ """
+
+ import argparse
+ import sys
+ import logging
+ from . import fixtures
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     datefmt='%Y-%m-%d %H:%M:%S'
+ )
+ logger = logging.getLogger("postgres_commands")
+
+
+ def start_postgres():
+     """
+     Start a PostgreSQL container with sample data.
+
+     This function is the entry point for the 'start_postgres' Poetry script.
+     It parses command-line arguments and starts a PostgreSQL container
+     with the requested configuration.
+
+     Returns:
+         Exit code (0 for success, 1 for failure)
+     """
+     # Check if Docker is available first
+     if not fixtures.is_docker_available():
+         logger.warning("Docker is not available. Cannot start PostgreSQL container.")
+         print("\n" + "=" * 50)
+         print("SKIPPING: Docker is not available on this system")
+         print("Tests requiring Docker will need to be skipped")
+         print("=" * 50 + "\n")
+         return 0  # Return success to allow the pipeline to continue
+
+     parser = argparse.ArgumentParser(description="Start PostgreSQL container with sample data")
+     parser.add_argument("--schema", choices=["movies", "stocks"], default="movies",
+                         help="Sample schema to use (movies or stocks)")
+     parser.add_argument("--port", type=int, default=5433,
+                         help="Port to expose PostgreSQL on (default: 5433)")
+     parser.add_argument("--user", default="testuser",
+                         help="PostgreSQL username (default: testuser)")
+     parser.add_argument("--password", default="testpass",
+                         help="PostgreSQL password (default: testpass)")
+     parser.add_argument("--db", default="testdb",
+                         help="PostgreSQL database name (default: testdb)")
+     parser.add_argument("--container-name", default="test-postgres-sample",
+                         help="Docker container name (default: test-postgres-sample)")
+     parser.add_argument("--image-tag", default="test-sample-db",
+                         help="Docker image tag (default: test-sample-db)")
+
+     args = parser.parse_args()
+
+     # Setup and start
+     if fixtures.setup_postgres_samples(args.schema, args.port, args.user, args.password, args.db, args.image_tag):
+         if fixtures.start_postgres_container(args.container_name, args.port, args.image_tag)[1]:
+             fixtures.print_connection_info(
+                 "localhost", args.port, args.db, args.user, args.password, args.container_name
+             )
+             return 0
+     return 1
+
+
+ def stop_postgres():
+     """
+     Stop the PostgreSQL container.
+
+     This function is the entry point for the 'stop_postgres' Poetry script.
+     It parses command-line arguments and stops the specified PostgreSQL container.
+
+     Returns:
+         Exit code (0 for success, 1 for failure)
+     """
+     # Check if Docker is available first
+     if not fixtures.is_docker_available():
+         logger.warning("Docker is not available. No PostgreSQL container to stop.")
+         return 0  # Return success to allow the pipeline to continue
+
+     parser = argparse.ArgumentParser(description="Stop PostgreSQL container")
+     parser.add_argument("--container-name", default="test-postgres-sample",
+                         help="Docker container name (default: test-postgres-sample)")
+     parser.add_argument("--timeout", type=int, default=15,
+                         help="Timeout for graceful container stop in seconds (default: 15)")
+
+     args = parser.parse_args()
+
+     if fixtures.stop_postgres_container(args.container_name, args.timeout):
+         print(f"Container {args.container_name} stopped successfully")
+         return 0
+     else:
+         print(f"Failed to stop container {args.container_name}")
+         return 1
+
+
+ if __name__ == "__main__":
+     # Allow direct script execution for testing
+     if len(sys.argv) > 1 and sys.argv[1] == "start":
+         del sys.argv[1]  # drop the subcommand so argparse sees only its own options
+         sys.exit(start_postgres())
+     elif len(sys.argv) > 1 and sys.argv[1] == "stop":
+         del sys.argv[1]
+         sys.exit(stop_postgres())
+     else:
+         print("Usage: python -m test_utils.postgres.commands [start|stop] [options]")
+         sys.exit(1)
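
Since both entry points parse sys.argv directly, one way to drive them programmatically is the sketch below (argument names are taken from the parser above; rewriting sys.argv here is purely for illustration):

    import sys
    from test_utils.postgres import commands

    # Simulate the 'start_postgres' Poetry entry point with explicit options.
    sys.argv = ["start_postgres", "--schema", "stocks", "--port", "5433"]
    raise SystemExit(commands.start_postgres())

Both functions return 0 even when Docker is absent, so CI pipelines that invoke them do not fail on machines without Docker; only an actual container-start or container-stop failure yields exit code 1.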