flowfile-0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of Flowfile has been flagged as possibly problematic.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
@@ -0,0 +1,503 @@
# Standard library imports
from base64 import decodebytes, encodebytes
import io
import threading
from time import sleep
from typing import Any, List, Literal, Optional
from uuid import uuid4

import polars as pl
import requests

from flowfile_core.configs import logger
from flowfile_core.configs.settings import WORKER_URL
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.models import (
    FuzzyJoinInput,
    FuzzyMap,
    OperationType,
    PolarsOperation,
    Status
)
from flowfile_core.flowfile.sources.external_sources.airbyte_sources.models import AirbyteSettings
from flowfile_core.flowfile.sources.external_sources.sql_source.models import (DatabaseExternalReadSettings,
                                                                               DatabaseExternalWriteSettings)
from flowfile_core.schemas.input_schema import (
    ReceivedCsvTable,
    ReceivedExcelTable,
    ReceivedJsonTable,
    ReceivedParquetTable
)
from flowfile_core.utils.arrow_reader import read

ReceivedTableCollection = ReceivedCsvTable | ReceivedParquetTable | ReceivedJsonTable | ReceivedExcelTable


def trigger_df_operation(flow_id: int, node_id: int | str, lf: pl.LazyFrame, file_ref: str, operation_type: OperationType = 'store') -> Status:
    encoded_operation = encodebytes(lf.serialize()).decode()
    _json = {'task_id': file_ref, 'operation': encoded_operation, 'operation_type': operation_type,
             'flowfile_flow_id': flow_id, 'flowfile_node_id': node_id}
    v = requests.post(url=f'{WORKER_URL}/submit_query/', json=_json)
    if not v.ok:
        raise Exception(f'Could not cache the data, {v.text}')
    return Status(**v.json())


def trigger_sample_operation(lf: pl.LazyFrame, file_ref: str, flow_id: int, node_id: str | int, sample_size: int = 100) -> Status:
    encoded_operation = encodebytes(lf.serialize()).decode()
    _json = {'task_id': file_ref, 'operation': encoded_operation, 'operation_type': 'store_sample',
             'sample_size': sample_size, 'flowfile_flow_id': flow_id, 'flowfile_node_id': node_id}
    v = requests.post(url=f'{WORKER_URL}/store_sample/', json=_json)
    if not v.ok:
        raise Exception(f'Could not cache the data, {v.text}')
    return Status(**v.json())


def trigger_fuzzy_match_operation(left_df: pl.LazyFrame, right_df: pl.LazyFrame,
                                  fuzzy_maps: List[FuzzyMap],
                                  file_ref: str,
                                  flow_id: int,
                                  node_id: int | str) -> Status:
    left_serializable_object = PolarsOperation(operation=encodebytes(left_df.serialize()))
    right_serializable_object = PolarsOperation(operation=encodebytes(right_df.serialize()))
    fuzzy_join_input = FuzzyJoinInput(left_df_operation=left_serializable_object,
                                      right_df_operation=right_serializable_object,
                                      fuzzy_maps=fuzzy_maps,
                                      task_id=file_ref,
                                      flowfile_flow_id=flow_id,
                                      flowfile_node_id=node_id
                                      )
    v = requests.post(f'{WORKER_URL}/add_fuzzy_join', data=fuzzy_join_input.model_dump_json())
    if not v.ok:
        raise Exception(f'Could not cache the data, {v.text}')
    return Status(**v.json())


def trigger_create_operation(flow_id: int, node_id: int | str, received_table: ReceivedTableCollection,
                             file_type: str = Literal['csv', 'parquet', 'json', 'excel']):
    f = requests.post(url=f'{WORKER_URL}/create_table/{file_type}', data=received_table.model_dump_json(),
                      params={'flowfile_flow_id': flow_id, 'flowfile_node_id': node_id})
    if not f.ok:
        raise Exception(f'Could not cache the data, {f.text}')
    return Status(**f.json())


def trigger_airbyte_collector(airbyte_settings: AirbyteSettings):
    f = requests.post(url=f'{WORKER_URL}/store_airbyte_result', data=airbyte_settings.model_dump_json())
    if not f.ok:
        raise Exception(f'Could not cache the data, {f.text}')
    return Status(**f.json())


def trigger_database_read_collector(database_external_read_settings: DatabaseExternalReadSettings):
    f = requests.post(url=f'{WORKER_URL}/store_database_read_result',
                      data=database_external_read_settings.model_dump_json())
    if not f.ok:
        raise Exception(f'Could not cache the data, {f.text}')
    return Status(**f.json())


def trigger_database_write(database_external_write_settings: DatabaseExternalWriteSettings):
    f = requests.post(url=f'{WORKER_URL}/store_database_write_result',
                      data=database_external_write_settings.model_dump_json())
    if not f.ok:
        raise Exception(f'Could not cache the data, {f.text}')
    return Status(**f.json())


def get_results(file_ref: str) -> Status | None:
    f = requests.get(f'{WORKER_URL}/status/{file_ref}')
    if f.status_code == 200:
        return Status(**f.json())
    else:
        raise Exception(f'Could not fetch the data, {f.text}')


def results_exists(file_ref: str):
    f = requests.get(f'{WORKER_URL}/status/{file_ref}')
    if f.status_code == 200:
        if f.json()['status'] == 'Completed':
            return True
    return False


def get_df_result(encoded_df: str) -> pl.LazyFrame:
    r = decodebytes(encoded_df.encode())
    return pl.LazyFrame.deserialize(io.BytesIO(r))


def get_external_df_result(file_ref: str) -> pl.LazyFrame | None:
    status = get_results(file_ref)
    if status.status != 'Completed':
        raise Exception(f"Status is not completed, {status.status}")
    if status.result_type == 'polars':
        return get_df_result(status.results)
    else:
        raise Exception(f"Result type is not polars, {status.result_type}")


def get_status(file_ref: str) -> Status:
    status_response = requests.get(f'{WORKER_URL}/status/{file_ref}')
    if status_response.status_code == 200:
        return Status(**status_response.json())
    else:
        raise Exception(f"Could not fetch the status, {status_response.text}")


def cancel_task(file_ref: str) -> bool:
    """
    Cancels a running task by making a request to the worker service.

    Args:
        file_ref: The unique identifier of the task to cancel

    Returns:
        bool: True if cancellation was successful, False otherwise

    Raises:
        Exception: If there's an error communicating with the worker service
    """
    try:
        response = requests.post(f'{WORKER_URL}/cancel_task/{file_ref}')
        if response.ok:
            return True
        return False
    except requests.RequestException as e:
        raise Exception(f'Failed to cancel task: {str(e)}')


class BaseFetcher:
    result: Optional[Any] = None
    started: bool = False
    running: bool = False
    error_code: int = 0
    error_description: Optional[str] = None
    file_ref: Optional[str] = None

    def __init__(self, file_ref: str = None):
        self.file_ref = file_ref if file_ref else str(uuid4())
        self.stop_event = threading.Event()
        self.thread = threading.Thread(target=self._fetch_cached_df)
        self.result = None
        self.error_description = None
        self.running = False
        self.started = False
        self.condition = threading.Condition()
        self.error_code = 0

    def _fetch_cached_df(self):
        with self.condition:
            if self.running:
                logger.info('Already running the fetching')
                return

            sleep_time = 1
            self.running = True
            while not self.stop_event.is_set():
                try:
                    r = requests.get(f'{WORKER_URL}/status/{self.file_ref}')
                    if r.status_code == 200:
                        status = Status(**r.json())
                        if status.status == 'Completed':
                            self._handle_completion(status)
                            return
                        elif status.status == 'Error':
                            self._handle_error(1, status.error_message)
                            break
                        elif status.status == 'Unknown Error':
                            self._handle_error(-1,
                                               'There was an unknown error with the process, and the process got killed by the server')
                            break
                    else:
                        self._handle_error(2, r.text)
                        break
                except requests.RequestException as e:
                    self._handle_error(2, f"Request failed: {e}")
                    break

                sleep(sleep_time)

            self._handle_cancellation()

    def _handle_completion(self, status):
        self.running = False
        self.condition.notify_all()
        if status.result_type == 'polars':
            self.result = get_df_result(status.results)
        else:
            self.result = status.results

    def _handle_error(self, code, description):
        self.error_code = code
        self.error_description = description
        self.running = False
        self.condition.notify_all()

    def _handle_cancellation(self):
        logger.warning("Fetch operation cancelled")
        if self.error_description is not None:
            logger.warning(self.error_description)
        self.running = False
        self.condition.notify_all()

    def start(self):
        if self.running:
            logger.info('Already running the fetching')
            return
        if not self.started:
            self.thread.start()
            self.started = True

    def cancel(self):
        """
        Cancels the current task both locally and on the worker service.
        Also cleans up any resources being used.
        """
        logger.warning('Cancelling the operation')
        try:
            cancel_task(self.file_ref)
        except Exception as e:
            logger.error(f'Failed to cancel task on worker: {str(e)}')

        # Then stop the local monitoring thread
        self.stop_event.set()
        self.thread.join()

        # Update local state
        with self.condition:
            self.running = False
            self.error_description = "Task cancelled by user"
            self.condition.notify_all()

    def get_result(self) -> Optional[Any]:
        if not self.started:
            self.start()
        with self.condition:
            while self.running and self.result is None:
                self.condition.wait()  # Wait until notified
            if self.error_description is not None:
                raise Exception(self.error_description)
            return self.result


class ExternalDfFetcher(BaseFetcher):
    status: Optional[Status] = None

    def __init__(self, flow_id: int, node_id: int | str, lf: pl.LazyFrame | pl.DataFrame, file_ref: str = None,
                 wait_on_completion: bool = True,
                 operation_type: OperationType = 'store'):
        super().__init__(file_ref=file_ref)
        lf = lf.lazy() if isinstance(lf, pl.DataFrame) else lf
        r = trigger_df_operation(lf=lf, file_ref=self.file_ref, operation_type=operation_type,
                                 node_id=node_id, flow_id=flow_id)
        self.running = r.status == 'Processing'
        if wait_on_completion:
            _ = self.get_result()
            self.status = get_status(self.file_ref)


class ExternalSampler(BaseFetcher):
    status: Optional[Status] = None

    def __init__(self, lf: pl.LazyFrame | pl.DataFrame, node_id: str | int, flow_id: int, file_ref: str = None, wait_on_completion: bool = True,
                 sample_size: int = 100):
        super().__init__(file_ref=file_ref)
        lf = lf.lazy() if isinstance(lf, pl.DataFrame) else lf
        r = trigger_sample_operation(lf=lf, file_ref=file_ref, sample_size=sample_size, node_id=node_id, flow_id=flow_id)
        self.running = r.status == 'Processing'
        if wait_on_completion:
            _ = self.get_result()
            self.status = get_status(self.file_ref)


class ExternalFuzzyMatchFetcher(BaseFetcher):
    def __init__(self, left_df: pl.LazyFrame, right_df: pl.LazyFrame, fuzzy_maps: List[Any], flow_id: int,
                 node_id: int | str,
                 file_ref: str = None,
                 wait_on_completion: bool = True):
        super().__init__(file_ref=file_ref)

        r = trigger_fuzzy_match_operation(left_df=left_df, right_df=right_df, fuzzy_maps=fuzzy_maps,
                                          file_ref=file_ref, flow_id=flow_id, node_id=node_id)
        self.file_ref = r.background_task_id
        self.running = r.status == 'Processing'
        if wait_on_completion:
            _ = self.get_result()


class ExternalCreateFetcher(BaseFetcher):
    def __init__(self, received_table: ReceivedTableCollection, node_id: int, flow_id: int,
                 file_type: str = 'csv', wait_on_completion: bool = True):
        r = trigger_create_operation(received_table=received_table, file_type=file_type,
                                     node_id=node_id, flow_id=flow_id)
        super().__init__(file_ref=r.background_task_id)
        self.running = r.status == 'Processing'
        if wait_on_completion:
            _ = self.get_result()


class ExternalAirbyteFetcher(BaseFetcher):
    def __init__(self, airbyte_settings: AirbyteSettings, wait_on_completion: bool = True):
        r = trigger_airbyte_collector(airbyte_settings)
        super().__init__(file_ref=r.background_task_id)
        self.running = r.status == 'Processing'
        if wait_on_completion:
            _ = self.get_result()


class ExternalDatabaseFetcher(BaseFetcher):
    def __init__(self, database_external_read_settings: DatabaseExternalReadSettings,
                 wait_on_completion: bool = True):
        r = trigger_database_read_collector(database_external_read_settings=database_external_read_settings)
        super().__init__(file_ref=r.background_task_id)
        self.running = r.status == 'Processing'
        if wait_on_completion:
            _ = self.get_result()


class ExternalDatabaseWriter(BaseFetcher):
    def __init__(self, database_external_write_settings: DatabaseExternalWriteSettings,
                 wait_on_completion: bool = True):
        r = trigger_database_write(database_external_write_settings=database_external_write_settings)
        super().__init__(file_ref=r.background_task_id)
        self.running = r.status == 'Processing'
        if wait_on_completion:
            _ = self.get_result()


class ExternalExecutorTracker:
    result: Optional[pl.LazyFrame]
    started: bool = False
    running: bool = False
    error_code: int = 0
    error_description: Optional[str] = None
    file_ref: str = None

    def __init__(self, initial_response: Status, wait_on_completion: bool = True):
        self.file_ref = initial_response.background_task_id
        self.stop_event = threading.Event()
        self.thread = threading.Thread(target=self._fetch_cached_df)
        self.result = None
        self.error_description = None
        self.running = initial_response.status == 'Processing'
        self.condition = threading.Condition()
        if wait_on_completion:
            _ = self.get_result()

    def _fetch_cached_df(self):
        with self.condition:
            if self.running:
                logger.info('Already running the fetching')
                return
            sleep_time = 1
            self.running = True
            while not self.stop_event.is_set():
                try:
                    r = requests.get(f'{WORKER_URL}/status/{self.file_ref}')
                    if r.status_code == 200:
                        status = Status(**r.json())
                        if status.status == 'Completed':
                            self.running = False
                            self.condition.notify_all()  # Notify all waiting threads
                            if status.result_type == 'polars':
                                self.result = get_df_result(status.results)
                            else:
                                self.result = status.results
                            return
                        elif status.status == 'Error':
                            self.error_code = 1
                            self.error_description = status.error_message
                            break
                        elif status.status == 'Unknown Error':
                            self.error_code = -1
                            self.error_description = 'There was an unknown error with the process, and the process got killed by the server'
                            break
                    else:
                        self.error_description = r.text
                        self.error_code = 2
                        break
                except requests.RequestException as e:
                    self.error_code = 2
                    self.error_description = f"Request failed: {e}"
                    break

                sleep(sleep_time)
            # logger.info('Fetching the data')

            logger.warning("Fetch operation cancelled")
            if self.error_description is not None:
                self.running = False
                logger.warning(self.error_description)
            self.condition.notify_all()
            return

    def start(self):
        self.started = True
        if self.running:
            logger.info('Already running the fetching')
            return
        self.thread.start()

    def cancel(self):
        logger.warning('Cancelling the operation')
        self.thread.join()

        self.running = False

    def get_result(self) -> pl.LazyFrame | Any | None:
        if not self.started:
            self.start()
        with self.condition:
            while self.running and self.result is None:
                self.condition.wait()  # Wait until notified
            if self.error_description is not None:
                raise Exception(self.error_description)
            return self.result


def fetch_unique_values(lf: pl.LazyFrame) -> List[str]:
    """
    Fetches unique values from a specified column in a LazyFrame, attempting first via an external fetcher
    and falling back to direct LazyFrame computation if that fails.

    Args:
        lf: A Polars LazyFrame containing the data
        column: Name of the column to extract unique values from

    Returns:
        List[str]: List of unique values from the specified column cast to strings

    Raises:
        ValueError: If no unique values are found or if the fetch operation fails

    Example:
        >>> lf = pl.LazyFrame({'category': ['A', 'B', 'A', 'C']})
        >>> unique_vals = fetch_unique_values(lf)
        >>> print(unique_vals)
        ['A', 'B', 'C']
    """
    try:
        # Try external source first if lf is provided
        try:
            external_df_fetcher = ExternalDfFetcher(lf=lf, flow_id=1, node_id=-1)
            if external_df_fetcher.status.status == 'Completed':

                unique_values = read(external_df_fetcher.status.file_ref).column(0).to_pylist()
                if logger:
                    logger.info(f"Got {len(unique_values)} unique values from external source")
                return unique_values
        except Exception as e:
            if logger:
                logger.debug(f"Failed reading external file: {str(e)}")

        unique_values = (lf.unique().collect(engine="streaming")[:, 0].to_list())

        if not unique_values:
            raise ValueError(f"No unique values found in lazyframe")

        return unique_values

    except Exception as e:
        error_msg = f"Failed to fetch unique values: {str(e)}"
        if logger:
            logger.error(error_msg)
        raise ValueError(error_msg) from e
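
The hunk above appears to correspond to flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py (the manifest's +503 entry). Its pattern: a trigger_* helper POSTs a serialized LazyFrame plan to the worker, the returned Status carries a background task id, and a fetcher polls {WORKER_URL}/status/{file_ref} on a background thread until completion. A minimal usage sketch, assuming that import path, a running flowfile_worker reachable at WORKER_URL, and purely illustrative flow_id/node_id values:

import polars as pl

# Hypothetical import path, inferred from the package manifest above.
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (
    ExternalDfFetcher,
)

lf = pl.LazyFrame({'a': [1, 2, 3]}).with_columns((pl.col('a') * 2).alias('b'))

# Serializes the plan, submits it to the worker, and blocks until the worker
# reports 'Completed' (wait_on_completion defaults to True).
fetcher = ExternalDfFetcher(flow_id=1, node_id=1, lf=lf)
result = fetcher.get_result()  # a pl.LazyFrame deserialized from the worker's response
print(result.collect())
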
@@ -0,0 +1,27 @@

import polars as pl
from flowfile_core.utils.fl_executor import process_executor
from flowfile_core.flowfile.flow_data_engine import utils

# calculate_schema_threaded = process_executor(wait_on_completion=True, max_workers=1)(utils.calculate_schema)
write_threaded = process_executor(False, max_workers=1)(utils.write_polars_frame)
collect_threaded = process_executor(wait_on_completion=False, max_workers=1)(utils.collect)
cache_polars_frame_to_temp_thread = process_executor(wait_on_completion=True, max_workers=1)(
    utils.cache_polars_frame_to_temp)


@process_executor(False, max_workers=1)
def do_something_random():
    print('10 seconds')


# @process_executor(False, max_workers=1)
def get_join_count(left: pl.LazyFrame, right: pl.LazyFrame, left_on_keys, right_on_keys, how):
    left_joined_df = left.group_by(left_on_keys).count()
    right_joined_df = right.group_by(right_on_keys).count()
    data: pl.LazyFrame = left_joined_df.join(right_joined_df, left_on=left_on_keys,
                                             right_on=right_on_keys, how=how)
    data = data.with_columns(pl.lit(1).alias('total').cast(pl.UInt64))
    result = data.select((pl.col('total') * pl.col('count') * pl.col('count_right'))).sum()
    n_records = result.collect().to_series().to_list()[0]
    return n_records
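
The +27 hunk above matches the manifest's flowfile_core/flowfile/flow_data_engine/threaded_processes.py entry. get_join_count estimates how many rows a join would produce without materializing it: it counts rows per key on each side, joins the per-key counts, and sums count * count_right. A small illustrative check, assuming that module path; the frames are made up:

import polars as pl

# Hypothetical import path, inferred from the package manifest above.
from flowfile_core.flowfile.flow_data_engine.threaded_processes import get_join_count

left = pl.LazyFrame({'k': ['a', 'a', 'b'], 'v': [1, 2, 3]})
right = pl.LazyFrame({'k': ['a', 'b', 'b'], 'w': [10, 20, 30]})

# Key 'a' pairs 2 x 1 rows and key 'b' pairs 1 x 2 rows, so an inner join yields 4 rows.
print(get_join_count(left, right, left_on_keys=['k'], right_on_keys=['k'], how='inner'))  # 4
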