flowfile-0.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of Flowfile might be problematic.

Files changed (171)
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py
@@ -0,0 +1,503 @@
+ # Standard library imports
+ from base64 import decodebytes, encodebytes
+ import io
+ import threading
+ from time import sleep
+ from typing import Any, List, Literal, Optional
+ from uuid import uuid4
+
+ import polars as pl
+ import requests
+
+ from flowfile_core.configs import logger
+ from flowfile_core.configs.settings import WORKER_URL
+ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.models import (
+     FuzzyJoinInput,
+     FuzzyMap,
+     OperationType,
+     PolarsOperation,
+     Status,
+ )
+ from flowfile_core.flowfile.sources.external_sources.airbyte_sources.models import AirbyteSettings
+ from flowfile_core.flowfile.sources.external_sources.sql_source.models import (
+     DatabaseExternalReadSettings,
+     DatabaseExternalWriteSettings,
+ )
+ from flowfile_core.schemas.input_schema import (
+     ReceivedCsvTable,
+     ReceivedExcelTable,
+     ReceivedJsonTable,
+     ReceivedParquetTable,
+ )
+ from flowfile_core.utils.arrow_reader import read
+
+ ReceivedTableCollection = ReceivedCsvTable | ReceivedParquetTable | ReceivedJsonTable | ReceivedExcelTable
+
+
+ def trigger_df_operation(flow_id: int, node_id: int | str, lf: pl.LazyFrame, file_ref: str,
+                          operation_type: OperationType = 'store') -> Status:
+     encoded_operation = encodebytes(lf.serialize()).decode()
+     _json = {'task_id': file_ref, 'operation': encoded_operation, 'operation_type': operation_type,
+              'flowfile_flow_id': flow_id, 'flowfile_node_id': node_id}
+     v = requests.post(url=f'{WORKER_URL}/submit_query/', json=_json)
+     if not v.ok:
+         raise Exception(f'Could not cache the data, {v.text}')
+     return Status(**v.json())
+
+
+ def trigger_sample_operation(lf: pl.LazyFrame, file_ref: str, flow_id: int, node_id: str | int,
+                              sample_size: int = 100) -> Status:
+     encoded_operation = encodebytes(lf.serialize()).decode()
+     _json = {'task_id': file_ref, 'operation': encoded_operation, 'operation_type': 'store_sample',
+              'sample_size': sample_size, 'flowfile_flow_id': flow_id, 'flowfile_node_id': node_id}
+     v = requests.post(url=f'{WORKER_URL}/store_sample/', json=_json)
+     if not v.ok:
+         raise Exception(f'Could not cache the data, {v.text}')
+     return Status(**v.json())
+
+
+ def trigger_fuzzy_match_operation(left_df: pl.LazyFrame, right_df: pl.LazyFrame,
+                                   fuzzy_maps: List[FuzzyMap],
+                                   file_ref: str,
+                                   flow_id: int,
+                                   node_id: int | str) -> Status:
+     left_serializable_object = PolarsOperation(operation=encodebytes(left_df.serialize()))
+     right_serializable_object = PolarsOperation(operation=encodebytes(right_df.serialize()))
+     fuzzy_join_input = FuzzyJoinInput(left_df_operation=left_serializable_object,
+                                       right_df_operation=right_serializable_object,
+                                       fuzzy_maps=fuzzy_maps,
+                                       task_id=file_ref,
+                                       flowfile_flow_id=flow_id,
+                                       flowfile_node_id=node_id)
+     v = requests.post(f'{WORKER_URL}/add_fuzzy_join', data=fuzzy_join_input.model_dump_json())
+     if not v.ok:
+         raise Exception(f'Could not cache the data, {v.text}')
+     return Status(**v.json())
+
+
+ def trigger_create_operation(flow_id: int, node_id: int | str, received_table: ReceivedTableCollection,
+                              file_type: Literal['csv', 'parquet', 'json', 'excel'] = 'csv') -> Status:
+     f = requests.post(url=f'{WORKER_URL}/create_table/{file_type}', data=received_table.model_dump_json(),
+                       params={'flowfile_flow_id': flow_id, 'flowfile_node_id': node_id})
+     if not f.ok:
+         raise Exception(f'Could not cache the data, {f.text}')
+     return Status(**f.json())
+
+
+ def trigger_airbyte_collector(airbyte_settings: AirbyteSettings) -> Status:
+     f = requests.post(url=f'{WORKER_URL}/store_airbyte_result', data=airbyte_settings.model_dump_json())
+     if not f.ok:
+         raise Exception(f'Could not cache the data, {f.text}')
+     return Status(**f.json())
+
+
+ def trigger_database_read_collector(database_external_read_settings: DatabaseExternalReadSettings) -> Status:
+     f = requests.post(url=f'{WORKER_URL}/store_database_read_result',
+                       data=database_external_read_settings.model_dump_json())
+     if not f.ok:
+         raise Exception(f'Could not cache the data, {f.text}')
+     return Status(**f.json())
+
+
+ def trigger_database_write(database_external_write_settings: DatabaseExternalWriteSettings) -> Status:
+     f = requests.post(url=f'{WORKER_URL}/store_database_write_result',
+                       data=database_external_write_settings.model_dump_json())
+     if not f.ok:
+         raise Exception(f'Could not cache the data, {f.text}')
+     return Status(**f.json())
+
+
+ def get_results(file_ref: str) -> Status:
+     f = requests.get(f'{WORKER_URL}/status/{file_ref}')
+     if f.status_code == 200:
+         return Status(**f.json())
+     else:
+         raise Exception(f'Could not fetch the data, {f.text}')
+
+
+ def results_exists(file_ref: str) -> bool:
+     f = requests.get(f'{WORKER_URL}/status/{file_ref}')
+     if f.status_code == 200:
+         if f.json()['status'] == 'Completed':
+             return True
+     return False
+
+
+ def get_df_result(encoded_df: str) -> pl.LazyFrame:
+     r = decodebytes(encoded_df.encode())
+     return pl.LazyFrame.deserialize(io.BytesIO(r))
+
+
+ def get_external_df_result(file_ref: str) -> pl.LazyFrame | None:
+     status = get_results(file_ref)
+     if status.status != 'Completed':
+         raise Exception(f"Status is not completed, {status.status}")
+     if status.result_type == 'polars':
+         return get_df_result(status.results)
+     else:
+         raise Exception(f"Result type is not polars, {status.result_type}")
+
+
+ def get_status(file_ref: str) -> Status:
+     status_response = requests.get(f'{WORKER_URL}/status/{file_ref}')
+     if status_response.status_code == 200:
+         return Status(**status_response.json())
+     else:
+         raise Exception(f"Could not fetch the status, {status_response.text}")
+
+
+ def cancel_task(file_ref: str) -> bool:
+     """
+     Cancels a running task by making a request to the worker service.
+
+     Args:
+         file_ref: The unique identifier of the task to cancel
+
+     Returns:
+         bool: True if cancellation was successful, False otherwise
+
+     Raises:
+         Exception: If there's an error communicating with the worker service
+     """
+     try:
+         response = requests.post(f'{WORKER_URL}/cancel_task/{file_ref}')
+         return response.ok
+     except requests.RequestException as e:
+         raise Exception(f'Failed to cancel task: {str(e)}') from e
+
+
+ class BaseFetcher:
+     result: Optional[Any] = None
+     started: bool = False
+     running: bool = False
+     error_code: int = 0
+     error_description: Optional[str] = None
+     file_ref: Optional[str] = None
+
+     def __init__(self, file_ref: Optional[str] = None):
+         self.file_ref = file_ref if file_ref else str(uuid4())
+         self.stop_event = threading.Event()
+         self.thread = threading.Thread(target=self._fetch_cached_df)
+         self.result = None
+         self.error_description = None
+         self.running = False
+         self.started = False
+         self.condition = threading.Condition()
+         self.error_code = 0
+
+     def _fetch_cached_df(self):
+         # Poll the worker's status endpoint until the task completes, errors out,
+         # or the stop event is set.
+         with self.condition:
+             if self.running:
+                 logger.info('Already running the fetching')
+                 return
+
+             sleep_time = 1
+             self.running = True
+             while not self.stop_event.is_set():
+                 try:
+                     r = requests.get(f'{WORKER_URL}/status/{self.file_ref}')
+                     if r.status_code == 200:
+                         status = Status(**r.json())
+                         if status.status == 'Completed':
+                             self._handle_completion(status)
+                             return
+                         elif status.status == 'Error':
+                             self._handle_error(1, status.error_message)
+                             break
+                         elif status.status == 'Unknown Error':
+                             self._handle_error(-1, 'There was an unknown error with the process, '
+                                                    'and the process got killed by the server')
+                             break
+                     else:
+                         self._handle_error(2, r.text)
+                         break
+                 except requests.RequestException as e:
+                     self._handle_error(2, f"Request failed: {e}")
+                     break
+
+                 sleep(sleep_time)
+
+             self._handle_cancellation()
+
+     def _handle_completion(self, status):
+         self.running = False
+         self.condition.notify_all()
+         if status.result_type == 'polars':
+             self.result = get_df_result(status.results)
+         else:
+             self.result = status.results
+
+     def _handle_error(self, code, description):
+         self.error_code = code
+         self.error_description = description
+         self.running = False
+         self.condition.notify_all()
+
+     def _handle_cancellation(self):
+         logger.warning("Fetch operation cancelled")
+         if self.error_description is not None:
+             logger.warning(self.error_description)
+         self.running = False
+         self.condition.notify_all()
+
+     def start(self):
+         if self.running:
+             logger.info('Already running the fetching')
+             return
+         if not self.started:
+             self.thread.start()
+             self.started = True
+
+     def cancel(self):
+         """
+         Cancels the current task both locally and on the worker service.
+         Also cleans up any resources being used.
+         """
+         logger.warning('Cancelling the operation')
+         try:
+             cancel_task(self.file_ref)
+         except Exception as e:
+             logger.error(f'Failed to cancel task on worker: {str(e)}')
+
+         # Then stop the local monitoring thread
+         self.stop_event.set()
+         self.thread.join()
+
+         # Update local state
+         with self.condition:
+             self.running = False
+             self.error_description = "Task cancelled by user"
+             self.condition.notify_all()
+
+     def get_result(self) -> Optional[Any]:
+         if not self.started:
+             self.start()
+         with self.condition:
+             while self.running and self.result is None:
+                 self.condition.wait()  # Wait until notified
+             if self.error_description is not None:
+                 raise Exception(self.error_description)
+             return self.result
+
+
+ class ExternalDfFetcher(BaseFetcher):
+     status: Optional[Status] = None
+
+     def __init__(self, flow_id: int, node_id: int | str, lf: pl.LazyFrame | pl.DataFrame, file_ref: str = None,
+                  wait_on_completion: bool = True,
+                  operation_type: OperationType = 'store'):
+         super().__init__(file_ref=file_ref)
+         lf = lf.lazy() if isinstance(lf, pl.DataFrame) else lf
+         r = trigger_df_operation(lf=lf, file_ref=self.file_ref, operation_type=operation_type,
+                                  node_id=node_id, flow_id=flow_id)
+         self.running = r.status == 'Processing'
+         if wait_on_completion:
+             _ = self.get_result()
+             self.status = get_status(self.file_ref)
+
+
+ class ExternalSampler(BaseFetcher):
+     status: Optional[Status] = None
+
+     def __init__(self, lf: pl.LazyFrame | pl.DataFrame, node_id: str | int, flow_id: int, file_ref: str = None,
+                  wait_on_completion: bool = True, sample_size: int = 100):
+         super().__init__(file_ref=file_ref)
+         lf = lf.lazy() if isinstance(lf, pl.DataFrame) else lf
+         # Pass self.file_ref (which falls back to a generated uuid) rather than the
+         # possibly-None file_ref argument, so the submitted task id matches the one polled.
+         r = trigger_sample_operation(lf=lf, file_ref=self.file_ref, sample_size=sample_size,
+                                      node_id=node_id, flow_id=flow_id)
+         self.running = r.status == 'Processing'
+         if wait_on_completion:
+             _ = self.get_result()
+             self.status = get_status(self.file_ref)
+
+
+ class ExternalFuzzyMatchFetcher(BaseFetcher):
+     def __init__(self, left_df: pl.LazyFrame, right_df: pl.LazyFrame, fuzzy_maps: List[Any], flow_id: int,
+                  node_id: int | str,
+                  file_ref: str = None,
+                  wait_on_completion: bool = True):
+         super().__init__(file_ref=file_ref)
+         r = trigger_fuzzy_match_operation(left_df=left_df, right_df=right_df, fuzzy_maps=fuzzy_maps,
+                                           file_ref=self.file_ref, flow_id=flow_id, node_id=node_id)
+         self.file_ref = r.background_task_id
+         self.running = r.status == 'Processing'
+         if wait_on_completion:
+             _ = self.get_result()
+
+
+ class ExternalCreateFetcher(BaseFetcher):
+     def __init__(self, received_table: ReceivedTableCollection, node_id: int, flow_id: int,
+                  file_type: str = 'csv', wait_on_completion: bool = True):
+         r = trigger_create_operation(received_table=received_table, file_type=file_type,
+                                      node_id=node_id, flow_id=flow_id)
+         super().__init__(file_ref=r.background_task_id)
+         self.running = r.status == 'Processing'
+         if wait_on_completion:
+             _ = self.get_result()
+
+
+ class ExternalAirbyteFetcher(BaseFetcher):
+     def __init__(self, airbyte_settings: AirbyteSettings, wait_on_completion: bool = True):
+         r = trigger_airbyte_collector(airbyte_settings)
+         super().__init__(file_ref=r.background_task_id)
+         self.running = r.status == 'Processing'
+         if wait_on_completion:
+             _ = self.get_result()
+
+
+ class ExternalDatabaseFetcher(BaseFetcher):
+     def __init__(self, database_external_read_settings: DatabaseExternalReadSettings,
+                  wait_on_completion: bool = True):
+         r = trigger_database_read_collector(database_external_read_settings=database_external_read_settings)
+         super().__init__(file_ref=r.background_task_id)
+         self.running = r.status == 'Processing'
+         if wait_on_completion:
+             _ = self.get_result()
+
+
+ class ExternalDatabaseWriter(BaseFetcher):
+     def __init__(self, database_external_write_settings: DatabaseExternalWriteSettings,
+                  wait_on_completion: bool = True):
+         r = trigger_database_write(database_external_write_settings=database_external_write_settings)
+         super().__init__(file_ref=r.background_task_id)
+         self.running = r.status == 'Processing'
+         if wait_on_completion:
+             _ = self.get_result()
+
+
+ class ExternalExecutorTracker:
+     result: Optional[pl.LazyFrame]
+     started: bool = False
+     running: bool = False
+     error_code: int = 0
+     error_description: Optional[str] = None
+     file_ref: Optional[str] = None
+
+     def __init__(self, initial_response: Status, wait_on_completion: bool = True):
+         self.file_ref = initial_response.background_task_id
+         self.stop_event = threading.Event()
+         self.thread = threading.Thread(target=self._fetch_cached_df)
+         self.result = None
+         self.error_description = None
+         self.running = initial_response.status == 'Processing'
+         self.condition = threading.Condition()
+         if wait_on_completion:
+             _ = self.get_result()
+
+     def _fetch_cached_df(self):
+         with self.condition:
+             if self.running:
+                 logger.info('Already running the fetching')
+                 return
+             sleep_time = 1
+             self.running = True
+             while not self.stop_event.is_set():
+                 try:
+                     r = requests.get(f'{WORKER_URL}/status/{self.file_ref}')
+                     if r.status_code == 200:
+                         status = Status(**r.json())
+                         if status.status == 'Completed':
+                             self.running = False
+                             self.condition.notify_all()  # Notify all waiting threads
+                             if status.result_type == 'polars':
+                                 self.result = get_df_result(status.results)
+                             else:
+                                 self.result = status.results
+                             return
+                         elif status.status == 'Error':
+                             self.error_code = 1
+                             self.error_description = status.error_message
+                             break
+                         elif status.status == 'Unknown Error':
+                             self.error_code = -1
+                             self.error_description = ('There was an unknown error with the process, '
+                                                       'and the process got killed by the server')
+                             break
+                     else:
+                         self.error_description = r.text
+                         self.error_code = 2
+                         break
+                 except requests.RequestException as e:
+                     self.error_code = 2
+                     self.error_description = f"Request failed: {e}"
+                     break
+
+                 sleep(sleep_time)
+
+             logger.warning("Fetch operation cancelled")
+             if self.error_description is not None:
+                 logger.warning(self.error_description)
+             self.running = False
+             self.condition.notify_all()
+
+     def start(self):
+         self.started = True
+         if self.running:
+             logger.info('Already running the fetching')
+             return
+         self.thread.start()
+
+     def cancel(self):
+         logger.warning('Cancelling the operation')
+         self.stop_event.set()  # signal the polling loop to exit before joining
+         self.thread.join()
+         self.running = False
+
+     def get_result(self) -> pl.LazyFrame | Any | None:
+         if not self.started:
+             self.start()
+         with self.condition:
+             while self.running and self.result is None:
+                 self.condition.wait()  # Wait until notified
+             if self.error_description is not None:
+                 raise Exception(self.error_description)
+             return self.result
+
+
+ def fetch_unique_values(lf: pl.LazyFrame) -> List[str]:
+     """
+     Fetches the unique values of the first column of a LazyFrame, attempting first via an
+     external fetcher and falling back to direct LazyFrame computation if that fails.
+
+     Args:
+         lf: A Polars LazyFrame containing the data; unique values are taken from its first column
+
+     Returns:
+         List[str]: List of unique values from the first column
+
+     Raises:
+         ValueError: If no unique values are found or if the fetch operation fails
+
+     Example:
+         >>> lf = pl.LazyFrame({'category': ['A', 'B', 'A', 'C']})
+         >>> unique_vals = fetch_unique_values(lf)
+         >>> print(unique_vals)
+         ['A', 'B', 'C']
+     """
+     try:
+         # Try the external worker first
+         try:
+             external_df_fetcher = ExternalDfFetcher(lf=lf, flow_id=1, node_id=-1)
+             if external_df_fetcher.status.status == 'Completed':
+                 unique_values = read(external_df_fetcher.status.file_ref).column(0).to_pylist()
+                 if logger:
+                     logger.info(f"Got {len(unique_values)} unique values from external source")
+                 return unique_values
+         except Exception as e:
+             if logger:
+                 logger.debug(f"Failed reading external file: {str(e)}")
+
+         # Fall back to computing locally
+         unique_values = lf.unique().collect(engine="streaming")[:, 0].to_list()
+
+         if not unique_values:
+             raise ValueError("No unique values found in lazyframe")
+
+         return unique_values
+
+     except Exception as e:
+         error_msg = f"Failed to fetch unique values: {str(e)}"
+         if logger:
+             logger.error(error_msg)
+         raise ValueError(error_msg) from e
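
For orientation, a minimal usage sketch of the fetcher API above (hypothetical, not part of the wheel; it assumes a flowfile_worker instance is reachable at WORKER_URL, and the flow/node ids are placeholders):

# Hypothetical usage sketch of ExternalDfFetcher; assumes a running worker at WORKER_URL.
import polars as pl

lf = pl.LazyFrame({'category': ['A', 'B', 'A', 'C']})
fetcher = ExternalDfFetcher(flow_id=1, node_id=1, lf=lf, wait_on_completion=False)
try:
    result = fetcher.get_result()  # starts the polling thread and blocks until 'Completed'
    if isinstance(result, pl.LazyFrame):
        print(result.collect())
except Exception:
    fetcher.cancel()  # stop polling locally and request cancellation on the worker
    raise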
flowfile_core/flowfile/flow_data_engine/threaded_processes.py
@@ -0,0 +1,27 @@
+
+ import polars as pl
+ from flowfile_core.utils.fl_executor import process_executor
+ from flowfile_core.flowfile.flow_data_engine import utils
+
+ # calculate_schema_threaded = process_executor(wait_on_completion=True, max_workers=1)(utils.calculate_schema)
+ write_threaded = process_executor(False, max_workers=1)(utils.write_polars_frame)
+ collect_threaded = process_executor(wait_on_completion=False, max_workers=1)(utils.collect)
+ cache_polars_frame_to_temp_thread = process_executor(wait_on_completion=True, max_workers=1)(
+     utils.cache_polars_frame_to_temp)
+
+
+ @process_executor(False, max_workers=1)
+ def do_something_random():
+     print('10 seconds')
+
+
+ # @process_executor(False, max_workers=1)
+ def get_join_count(left: pl.LazyFrame, right: pl.LazyFrame, left_on_keys, right_on_keys, how):
+     # Count rows per join key on each side; the product of the two counts per key
+     # is the number of rows that key contributes to the join result.
+     left_joined_df = left.group_by(left_on_keys).count()
+     right_joined_df = right.group_by(right_on_keys).count()
+     data: pl.LazyFrame = left_joined_df.join(right_joined_df, left_on=left_on_keys,
+                                              right_on=right_on_keys, how=how)
+     data = data.with_columns(pl.lit(1).alias('total').cast(pl.UInt64))
+     result = data.select((pl.col('total') * pl.col('count') * pl.col('count_right'))).sum()
+     n_records = result.collect().to_series().to_list()[0]
+     return n_records
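
As a sanity check on get_join_count above: it multiplies the per-key row counts of both sides, which for matching keys is exactly the number of rows the join would produce. A small hypothetical example (not part of the wheel):

# Hypothetical example: 'a' appears twice on each side (2 * 2 = 4 rows) and 'b'
# once on each side (1 * 1 = 1 row), so an inner join yields 5 rows.
import polars as pl

left = pl.LazyFrame({'k': ['a', 'a', 'b'], 'v': [1, 2, 3]})
right = pl.LazyFrame({'k': ['a', 'a', 'b'], 'w': [4, 5, 6]})
print(get_join_count(left, right, ['k'], ['k'], 'inner'))  # expected: 5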