Flowfile 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic. Click here for more details.

Files changed (171) hide show
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
@@ -0,0 +1,771 @@
1
+
2
+ from typing import List, Union, Callable, Any, Optional, Generator, Literal
3
+ from flowfile_core.configs import logger
4
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
5
+ from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
6
+ from flowfile_core.utils.arrow_reader import get_read_top_n
7
+ from flowfile_core.schemas import input_schema, schemas
8
+ from flowfile_core.configs.flow_logger import NodeLogger
9
+
10
+ from flowfile_core.schemas.output_model import TableExample, FileColumn, NodeData
11
+ from flowfile_core.flowfile.utils import get_hash
12
+ from flowfile_core.configs.node_store import nodes as node_interface
13
+ from flowfile_core.flowfile.setting_generator import setting_generator, setting_updator
14
+ from time import sleep
15
+ from flowfile_core.flowfile.flow_data_engine.subprocess_operations import (
16
+ ExternalDfFetcher, ExternalSampler, results_exists, get_external_df_result, ExternalDatabaseFetcher, ExternalDatabaseWriter)
17
+ from flowfile_core.flowfile.flow_node.models import (NodeStepSettings, NodeStepInputs, NodeSchemaInformation,
18
+ NodeStepStats, NodeResults)
19
+ from flowfile_core.flowfile.flow_node.schema_callback import SingleExecutionFuture
20
+
21
+
22
+ class FlowNode:
23
+ parent_uuid: str
24
+ node_type: str
25
+ node_template: node_interface.NodeTemplate
26
+ node_default: schemas.NodeDefault
27
+ node_schema: NodeSchemaInformation
28
+ node_inputs: NodeStepInputs
29
+ node_stats: NodeStepStats
30
+ node_settings: NodeStepSettings
31
+ results: NodeResults
32
+ node_information: Optional[schemas.NodeInformation] = None
33
+ leads_to_nodes: List["FlowNode"] = [] # list with target flows, after execution the step will trigger those step(s)
34
+ _setting_input: Any = None
35
+ _hash: Optional[str] = None # host this for caching results
36
+ _function: Callable = None # the function that needs to be executed when triggered
37
+ _schema_callback: Optional[SingleExecutionFuture] = None # Function that calculates the schema without executing
38
+ _state_needs_reset: bool = False
39
+ _fetch_cached_df: Optional[ExternalDfFetcher | ExternalDatabaseFetcher | ExternalDatabaseWriter] = None
40
+ _cache_progress: Optional[ExternalDfFetcher | ExternalDatabaseFetcher | ExternalDatabaseWriter] = None
41
+
42
def post_init(self):
    """Give this instance its own fresh state containers.

    Creates new input, stats, settings, schema, result and node-information
    objects, empties the list of downstream nodes and clears the setting
    input, cache-progress handle, schema callback and reset flag.
    """
    # Container objects, created in the same order as before.
    self.node_inputs = NodeStepInputs()
    self.node_stats = NodeStepStats()
    self.node_settings = NodeStepSettings()
    self.node_schema = NodeSchemaInformation()
    self.results = NodeResults()
    self.node_information = schemas.NodeInformation()

    # Plain per-instance values; a new list so instances do not share
    # the class-level default.
    self.leads_to_nodes = []
    self._setting_input = None
    self._cache_progress = None
    self._schema_callback = None
    self._state_needs_reset = False
54
+
55
+ @property
56
+ def state_needs_reset(self):
57
+ return self._state_needs_reset
58
+
59
+ @state_needs_reset.setter
60
+ def state_needs_reset(self, v: bool):
61
+ self._state_needs_reset = v
62
+
63
@property
def schema_callback(self):
    """Return the stored schema callback wrapper (``_schema_callback``)."""
    return self._schema_callback

@schema_callback.setter
def schema_callback(self, f: Callable):
    """Wrap ``f`` in a SingleExecutionFuture and start it.

    Passing ``None`` is a no-op: any previously stored callback is kept.
    """
    if f is not None:
        def on_failure(exc: Exception) -> List:
            # Log the failure, flag the node as having setup errors and
            # hand back an empty list as the result.
            logger.warning(exc)
            self.node_settings.setup_errors = True
            return []

        self._schema_callback = SingleExecutionFuture(f, on_failure)
        self._schema_callback.start()
80
+
81
+ @property
82
+ def is_start(self) -> bool:
83
+ return not self.has_input and self.node_template.input == 0
84
+
85
def get_input_type(self, node_id: int) -> List:
    """Return how the node with ``node_id`` is connected to this node.

    :param node_id: id of the candidate input node.
    :return: a list containing any of ``'main'``, ``'left'`` and ``'right'``,
        always in that order; empty when the node is not an input.
    """
    inputs = self.node_inputs
    relation_type = []
    # main_inputs may be None (other methods in this class check for it),
    # so treat that as "no main inputs" instead of raising.
    if any(n.node_id == node_id for n in (inputs.main_inputs or [])):
        relation_type.append('main')
    if inputs.left_input is not None and node_id == inputs.left_input.node_id:
        relation_type.append('left')
    if inputs.right_input is not None and node_id == inputs.right_input.node_id:
        relation_type.append('right')
    # Each label is appended at most once, so no de-duplication is needed;
    # returning the list directly keeps the order deterministic.
    return relation_type
94
+
95
def __init__(self, node_id: Union[str, int], function: Callable,
             parent_uuid: str,
             setting_input: Any,
             name: str,
             node_type: str,
             input_columns: List[str] = None,
             output_schema: List[FlowfileColumn] = None,
             drop_columns: List[str] = None,
             renew_schema: bool = True,
             pos_x: float = 0,
             pos_y: float = 0,
             schema_callback: Callable = None,
             ):
    """Create a node, initialise its state and configure it via update_node.

    :param node_id: identifier stored on the node information.
    :param function: callable (or data object) the node executes.
    :param parent_uuid: stored on the node and mixed into its hash.
    :param setting_input: settings object for the node.
    :param name: display name; update_node falls back to ``node_type`` if None.
    :param node_type: key used to look up the node template and defaults.
    :param input_columns: optional input column names.
    :param output_schema: optional output columns.
    :param drop_columns: optional columns to drop.
    :param renew_schema: stored on the node settings before update_node runs.
    :param pos_x: x position forwarded to update_node.
    :param pos_y: y position forwarded to update_node.
    :param schema_callback: optional callable that computes the schema.
    """
    self.parent_uuid = parent_uuid
    # Fresh state containers must exist before anything is written to them.
    self.post_init()
    self.active = True
    self.node_information.id = node_id
    self.node_type = node_type
    self.node_settings.renew_schema = renew_schema
    self.update_node(
        function=function,
        input_columns=input_columns,
        output_schema=output_schema,
        drop_columns=drop_columns,
        setting_input=setting_input,
        name=name,
        pos_x=pos_x,
        pos_y=pos_y,
        schema_callback=schema_callback,
    )
124
+
125
def update_node(self,
                function: Callable,
                input_columns: List[str] = None,
                output_schema: List[FlowfileColumn] = None,
                drop_columns: List[str] = None,
                name: str = None,
                setting_input: Any = None,
                pos_x: float = 0,
                pos_y: float = 0,
                schema_callback: Callable = None,
                ):
    """(Re)configure the node with a new function, settings and schema hints.

    The statements run in a fixed order because assigning ``setting_input``
    goes through a property setter that reads state set earlier here.

    :raises Exception: when no node template exists for ``self.node_type``.
    """
    self.schema_callback = schema_callback

    info = self.node_information
    info.y_position = pos_y
    info.x_position = pos_x
    info.setting_input = setting_input

    # Fall back to the node type when no explicit name is given.
    self.name = name if name is not None else self.node_type
    self._function = function

    node_schema = self.node_schema
    node_schema.input_columns = input_columns if input_columns is not None else []
    node_schema.output_columns = output_schema if output_schema is not None else []
    node_schema.drop_columns = drop_columns if drop_columns is not None else []

    self.node_settings.renew_schema = True
    if hasattr(setting_input, 'cache_results'):
        self.node_settings.cache_results = setting_input.cache_results

    self.setting_input = setting_input
    self.results.errors = None
    self.add_lead_to_in_depend_source()
    _ = self.hash  # touch the property so the hash is computed and stored

    template = node_interface.node_dict.get(self.node_type)
    self.node_template = template
    if template is None:
        raise Exception(f'Node template {self.node_type} not found')
    self.node_default = node_interface.node_defaults.get(self.node_type)
157
+
158
+ @property
159
+ def name(self):
160
+ return self._name
161
+
162
+ @name.setter
163
+ def name(self, name: str):
164
+ self._name = name
165
+ self.__name__ = name
166
+
167
@property
def setting_input(self):
    """Settings object currently attached to the node."""
    return self._setting_input

@setting_input.setter
def setting_input(self, setting_input: Any):
    """Attach new settings, refresh the node information and reset state.

    For a ``manual_input`` node with NodeManualInput settings the function
    is rebuilt from the raw data when the hash changed or the node has not
    run yet. Any other non-None settings simply reset the node.
    """
    # NOTE(review): nesting reconstructed from a flattened diff; the reset
    # and schema prediction are assumed to belong to the inner condition.
    self._setting_input = setting_input
    self.set_node_information()

    is_manual_input = (self.node_type == 'manual_input'
                       and isinstance(self._setting_input, input_schema.NodeManualInput))
    if is_manual_input:
        if self.hash != self.calculate_hash(setting_input) or not self.node_stats.has_run:
            # Re-create the function object from the new raw data.
            self.function = self.function.__class__(setting_input.raw_data)
            self.reset()
            self.get_predicted_schema()
    elif self._setting_input is not None:
        self.reset()
182
+
183
+ @property
184
+ def node_id(self):
185
+ return self.node_information.id
186
+
187
+ @property
188
+ def left_input(self):
189
+ return self.node_inputs.left_input
190
+
191
+ @property
192
+ def right_input(self):
193
+ return self.node_inputs.right_input
194
+
195
+ @property
196
+ def main_input(self) -> List["FlowNode"]:
197
+ return self.node_inputs.main_inputs
198
+
199
@property
def is_correct(self):
    """Whether the connected inputs satisfy the node template.

    A node whose settings are still a NodePromise is never correct.
    Otherwise the node is correct when the number of connected inputs
    equals the template's input count, or when the template is ``multi``
    and either has at least one input or can be a start node.
    """
    if isinstance(self.setting_input, input_schema.NodePromise):
        return False
    template = self.node_template
    connected = len(self.node_inputs.get_all_inputs())
    return (template.input == connected
            or (template.multi and (connected > 0 or template.can_be_start)))
207
+
208
def set_node_information(self):
    """Copy the node's current wiring, settings and position into node_information."""
    logger.info('setting node information')
    info = self.node_information
    inputs = self.node_inputs

    info.left_input_id = inputs.left_input.node_id if self.left_input else None
    info.right_input_id = inputs.right_input.node_id if self.right_input else None
    if inputs.main_inputs is not None:
        info.input_ids = [upstream.node_id for upstream in inputs.main_inputs]
    else:
        info.input_ids = None

    settings = self.setting_input
    info.setting_input = settings
    info.outputs = [downstream.node_id for downstream in self.leads_to_nodes]
    info.is_setup = self.is_setup
    # The position is taken from the settings object.
    info.x_position = self.setting_input.pos_x
    info.y_position = self.setting_input.pos_y
    info.type = self.node_type
221
+
222
+ def get_node_information(self) -> schemas.NodeInformation:
223
+ self.set_node_information()
224
+ return self.node_information
225
+
226
+ @property
227
+ def function(self):
228
+ return self._function
229
+
230
def reset_hash(self) -> bool:
    """Recompute the hash and report whether it changed.

    When the hash changed and result caching is enabled, remove_cache is
    called as well.

    :return: True if the new hash differs from the previous one.
    """
    previous = self._hash
    self._hash = None  # forces the ``hash`` property to recalculate
    if self.hash == previous:
        return False
    if self.node_settings.cache_results:
        self.remove_cache()
    return True
238
+
239
+ @property
240
+ def all_inputs(self) -> List["FlowNode"]:
241
+ return self.node_inputs.get_all_inputs()
242
+
243
def calculate_hash(self, setting_input: Any):
    """Hash the given settings together with the input hashes and parent uuid.

    :param setting_input: settings object to hash.
    :return: result of ``get_hash`` over the input hashes, the settings
        hash and ``parent_uuid``, in that order.
    """
    parts = [upstream.hash for upstream in self.all_inputs]
    parts.append(get_hash(setting_input))
    parts.append(self.parent_uuid)
    return get_hash(parts)
247
+
248
+ @property
249
+ def hash(self):
250
+ if not self._hash:
251
+ self._hash = self.calculate_hash(self.setting_input)
252
+ return self._hash
253
+
254
+ @function.setter
255
+ def function(self, function: Callable):
256
+ self._function = function
257
+ # self.reset()
258
+
259
def add_node_connection(self, from_node: "FlowNode", insert_type: Literal['main', 'left', 'right'] = 'main'):
    """Connect ``from_node`` as an input of this node.

    :param from_node: upstream node to connect.
    :param insert_type: which input slot to use: ``'main'``, ``'left'`` or
        ``'right'``.
    :raises Exception: when ``insert_type`` is not one of the known slots.
        The check runs before any state is changed, so a failed call leaves
        both nodes untouched.
    """
    # NOTE(review): nesting reconstructed from a flattened diff; both resets
    # are assumed to run unconditionally at the end.
    if insert_type not in ('main', 'left', 'right'):
        raise Exception('Cannot find the connection')

    from_node.leads_to_nodes.append(self)
    if insert_type == 'main':
        # Templates with at most two inputs keep a single main input; the
        # new connection replaces whatever was there.
        if self.node_template.input <= 2 or self.node_inputs.main_inputs is None:
            self.node_inputs.main_inputs = [from_node]
        else:
            self.node_inputs.main_inputs.append(from_node)
    elif insert_type == 'right':
        self.node_inputs.right_input = from_node
    else:
        self.node_inputs.left_input = from_node

    if self.setting_input.is_setup:
        if hasattr(self.setting_input, 'depending_on_id') and insert_type == 'main':
            self.setting_input.depending_on_id = from_node.node_id
    self.reset()
    from_node.reset()
277
+
278
def evaluate_nodes(self, deep: bool = False):
    """Reset every node this node leads to, logging each reset.

    :param deep: forwarded to each downstream node's ``reset``.
    """
    for downstream in self.leads_to_nodes:
        message = f'resetting node: {downstream.node_id}'
        self.print(message)
        downstream.reset(deep)
282
+
283
+ def get_flow_file_column_schema(self, col_name: str) -> FlowfileColumn:
284
+ for s in self.schema:
285
+ if s.column_name == col_name:
286
+ return s
287
+
288
def get_predicted_schema(self, force: bool = False):
    """Return the predicted schema of this node, computing it when needed.

    The stored prediction is returned unless ``force`` is set. Otherwise the
    schema callback is tried first; when it is absent or yields ``None`` the
    schema is taken from the predicted resulting data.

    :param force: recompute even when a prediction is already stored; this
        also resets the schema callback so it runs again.
    :return: the predicted schema, or whatever is stored (possibly None)
        when no prediction could be made.
    """
    # NOTE(review): nesting reconstructed from a flattened diff; the early
    # return is assumed to apply only when the callback produced a schema.
    node_schema = self.node_schema
    if node_schema.predicted_schema is not None and not force:
        return node_schema.predicted_schema

    if self.schema_callback is not None:
        self.print('Getting the data from a schema callback')
        if force:
            # Force the schema callback to reset, so that it will be executed again
            self.schema_callback.reset()
        schema = self.schema_callback()
        if schema is not None:
            self.print('Calculating the schema based on the schema callback')
            node_schema.predicted_schema = schema
            return node_schema.predicted_schema

    # Evaluate the predicted data once and reuse it, instead of running
    # the node function a second time just to read the schema.
    predicted_data = self._predicted_data_getter()
    if predicted_data is not None and predicted_data.schema is not None:
        self.print('Calculating the schema based on the predicted resulting data')
        node_schema.predicted_schema = predicted_data.schema
    return node_schema.predicted_schema
310
+
311
+ @property
312
+ def is_setup(self) -> bool:
313
+ if not self.node_information.is_setup:
314
+ if self.function.__name__ != 'placeholder':
315
+ self.node_information.is_setup = True
316
+ self.setting_input.is_setup = True
317
+ return self.node_information.is_setup
318
+
319
def print(self, v: Any):
    """Log ``v`` at info level, prefixed with the node type and node id."""
    message = f'{self.node_type}, node_id: {self.node_id}: {v}'
    logger.info(message)
321
+
322
def get_resulting_data(self) -> FlowDataEngine:
    """Return the node's resulting data, computing it on first use.

    Nothing is computed when the node is not set up, or when a result or an
    error is already stored. On failure an empty FlowDataEngine and the error
    text are stored, ``has_run`` is cleared and the exception is re-raised.
    """
    # NOTE(review): nesting reconstructed from a flattened diff; a node that
    # is not set up is assumed to return None.
    if not self.is_setup:
        return None

    if self.results.resulting_data is None and self.results.errors is None:
        self.print('getting resulting data')
        try:
            if isinstance(self.function, FlowDataEngine):
                # The function already is the data.
                fl: FlowDataEngine = self.function
            elif self.node_type in ('external_source', 'airbyte_reader'):
                fl: FlowDataEngine = self.function()
                fl.collect_external()
                self.node_settings.streamable = False
            else:
                upstream_results = [v.get_resulting_data() for v in self.all_inputs]
                fl = self._function(*upstream_results)
            fl.set_streamable(self.node_settings.streamable)
            self.results.resulting_data = fl
            self.node_schema.result_schema = fl.schema
        except Exception as e:
            self.results.resulting_data = FlowDataEngine()
            self.results.errors = str(e)
            self.node_stats.has_run = False
            raise e
    return self.results.resulting_data
347
+
348
def _predicted_data_getter(self) -> FlowDataEngine|None:
    """Run the node function on the predicted data of all inputs.

    :return: the function's result; an empty FlowDataEngine on a ValueError
        other than "generator already executing" (that one is retried after
        one second); None after any other exception, which is only logged.
    """
    try:
        predicted_inputs = [v.get_predicted_resulting_data() for v in self.all_inputs]
        return self._function(*predicted_inputs)
    except ValueError as e:
        if str(e) != "generator already executing":
            return FlowDataEngine()
        logger.info('Generator already executing, waiting for the result')
        sleep(1)
        return self._predicted_data_getter()
    except Exception as e:
        logger.warning('there was an issue with the function, returning an empty Flowfile')
        logger.warning(e)
        return None
363
+
364
def get_predicted_resulting_data(self) -> FlowDataEngine:
    """Return data representing this node's predicted output.

    When a result schema is stored, or the node needs to run and has a
    schema callback, the data is built from that schema. Otherwise the
    function itself is returned if it is a FlowDataEngine, else data is
    built from the predicted schema.
    """
    use_known_schema = ((self.needs_run(False) and self.schema_callback is not None)
                        or self.node_schema.result_schema is not None)
    if use_known_schema:
        self.print('Getting data based on the schema')
        if self.node_schema.result_schema is None:
            _s = self.schema_callback()
        else:
            _s = self.node_schema.result_schema
        return FlowDataEngine.create_from_schema(_s)

    if isinstance(self.function, FlowDataEngine):
        return self.function
    return FlowDataEngine.create_from_schema(self.get_predicted_schema())
375
+
376
def add_lead_to_in_depend_source(self):
    """Register this node as a downstream node on each of its inputs.

    An input that already lists a node with this node's id is left alone.
    """
    for upstream in self.all_inputs:
        linked_ids = [linked.node_id for linked in upstream.leads_to_nodes]
        if self.node_id in linked_ids:
            continue
        upstream.leads_to_nodes.append(self)
380
+
381
def get_all_dependent_nodes(self) -> Generator["FlowNode", None, None]:
    """Yield every downstream node, depth first, each followed by its own dependents."""
    for downstream in self.leads_to_nodes:
        yield downstream
        yield from downstream.get_all_dependent_nodes()
386
+
387
def get_all_dependent_node_ids(self) -> Generator[int, None, None]:
    """Yield the id of every downstream node, depth first."""
    for downstream in self.leads_to_nodes:
        yield downstream.node_id
        yield from downstream.get_all_dependent_node_ids()
392
+
393
+ @property
394
+ def schema(self) -> List[FlowfileColumn]:
395
+ try:
396
+ if self.is_setup and self.results.errors is None:
397
+ if self.node_schema.result_schema is not None and len(self.node_schema.result_schema) > 0:
398
+ return self.node_schema.result_schema
399
+ elif self.node_type == 'output':
400
+ if len(self.node_inputs.main_inputs) > 0:
401
+ self.node_schema.result_schema = self.node_inputs.main_inputs[0].schema
402
+ else:
403
+ self.node_schema.result_schema = self.get_predicted_schema()
404
+ return self.node_schema.result_schema
405
+ else:
406
+ return []
407
+ except:
408
+ return []
409
+
410
def load_from_cache(self) -> FlowDataEngine:
    """Return the result of the stored fetcher when a result exists for this hash.

    :return: a FlowDataEngine built from ``_fetch_cached_df.get_result()``;
        None when no result exists or when reading it fails (the error is
        logged).
    """
    if not results_exists(self.hash):
        return None
    try:
        cached = self._fetch_cached_df.get_result()
        return FlowDataEngine(cached)
    except Exception as e:
        logger.error(e)
        return None
416
+
417
def remove_cache(self):
    """Placeholder: only logs a warning when a result exists for this hash."""
    has_cached_result = results_exists(self.hash)
    if has_cached_result:
        logger.warning('Not implemented')
420
+
421
def needs_run(self, performance_mode: bool, node_logger: NodeLogger = None,
              execution_location: schemas.ExecutionLocationsLiteral = "auto") -> bool:
    """Decide whether the node has to be executed.

    :param performance_mode: when True, an existing result does not prevent a
        rerun unless caching is enabled on the node.
    :param node_logger: logger to report to; the module logger is used if None.
    :param execution_location: ``"local"`` always yields False.
    :return: True when the node should run.
    """
    if execution_location == "local":
        return False
    flow_logger = node_logger if node_logger is not None else logger
    cache_result_exists = results_exists(self.hash)

    if not self.node_stats.has_run:
        flow_logger.info('Node has not run, needs to run')
        return True
    if self.node_settings.cache_results:
        # With caching on, only the presence of the result matters.
        return not cache_result_exists
    if not performance_mode and cache_result_exists:
        return False
    return True
439
+
440
+ def __call__(self, *args, **kwargs):
441
+ self.execute_node(*args, **kwargs)
442
+
443
def execute_local(self, flow_id: int, performance_mode: bool = False):
    """Compute the node's result in this process.

    Outside performance mode an ExternalSampler is created for the result and
    handed to store_example_data_generator, and the node is marked as run
    when no error is stored and it was not cancelled. On failure the error
    is logged and stored, ``has_run`` is cleared and the exception re-raised.

    :param flow_id: flow id passed to the sampler.
    :param performance_mode: skip sampling when True.
    """
    # NOTE(review): nesting reconstructed from a flattened diff; ``has_run``
    # is assumed to be set only inside the non-performance branch.
    try:
        resulting_data = self.get_resulting_data()
        if not performance_mode:
            sampler = ExternalSampler(lf=resulting_data.data_frame, file_ref=self.hash,
                                      wait_on_completion=True, node_id=self.node_id, flow_id=flow_id)
            self.store_example_data_generator(sampler)
            finished_cleanly = self.results.errors is None and not self.node_stats.is_canceled
            if finished_cleanly:
                self.node_stats.has_run = True
        self.node_schema.result_schema = resulting_data.schema
    except Exception as e:
        logger.warning(f"Error with step {self.__name__}")
        logger.error(str(e))
        self.results.errors = str(e)
        self.node_stats.has_run = False
        raise e

    if not self.node_stats.has_run:
        return
    # A non-streamable node makes its downstream nodes non-streamable too.
    for step in self.leads_to_nodes:
        if not self.node_settings.streamable:
            step.node_settings.streamable = self.node_settings.streamable
465
+
466
def execute_remote(self, performance_mode: bool = False, node_logger: NodeLogger = None):
    """Compute the node's result through an external process.

    A cached result is used when caching is on and one exists. ``output``
    nodes are computed directly. Otherwise the lazy result is built and,
    outside performance mode, handed to an ExternalDfFetcher whose result
    becomes the node's resulting data.

    :param performance_mode: when True, only the lazy result is built.
    :param node_logger: required logger; also supplies the flow id.
    :raises Exception: when ``node_logger`` is None, when building the lazy
        result fails, or when the external process fails with a known error.
    """
    # NOTE(review): nesting reconstructed from a flattened diff; the fetch
    # and its error handling are assumed to sit inside the non-performance
    # branch, where the fetcher is defined.
    if node_logger is None:
        raise Exception('Node logger is not defined')

    if self.node_settings.cache_results and results_exists(self.hash):
        try:
            self.results.resulting_data = get_external_df_result(self.hash)
            self._cache_progress = None
            return
        except Exception:
            node_logger.warning('Failed to read the cache, rerunning the code')

    if self.node_type == 'output':
        self.results.resulting_data = self.get_resulting_data()
        self.node_stats.has_run = True
        return

    try:
        self.get_resulting_data()
    except Exception as e:
        self.results.errors = 'Error with creating the lazy frame, most likely due to invalid graph'
        raise e

    if performance_mode:
        return

    external_df_fetcher = ExternalDfFetcher(lf=self.get_resulting_data().data_frame,
                                            file_ref=self.hash, wait_on_completion=False,
                                            flow_id=node_logger.flow_id,
                                            node_id=self.node_id)
    # Kept on the instance while running so cancel() can reach it.
    self._fetch_cached_df = external_df_fetcher
    try:
        lf = external_df_fetcher.get_result()
        record_counter = ExternalDfFetcher(lf=lf, operation_type='calculate_number_of_records',
                                           flow_id=node_logger.flow_id, node_id=self.node_id)
        self.results.resulting_data = FlowDataEngine(lf, number_of_records=record_counter.result)
        self.store_example_data_generator(external_df_fetcher)
        self.node_stats.has_run = True
    except Exception as e:
        node_logger.error('Error with external process')
        if external_df_fetcher.error_code == -1:
            # Unknown failure of the external process: fall back to the
            # locally built result and only record a warning.
            try:
                self.results.resulting_data = self.get_resulting_data()
                self.results.warnings = ('Error with external process (unknown error), '
                                         'likely the process was killed by the server because of memory constraints, '
                                         'continue with the process. '
                                         'We cannot display example data...')
            except Exception as fallback_error:
                self.results.errors = str(fallback_error)
                raise fallback_error
        elif external_df_fetcher.error_description is None:
            self.results.errors = str(e)
            raise e
        else:
            self.results.errors = external_df_fetcher.error_description
            raise Exception(external_df_fetcher.error_description)
    finally:
        self._fetch_cached_df = None
522
+
523
def prepare_before_run(self):
    """Clear stored errors, resulting data and example data before a run."""
    results = self.results
    results.errors = None
    results.resulting_data = None
    results.example_data = None
527
+
528
def cancel(self):
    """Cancel the running external fetcher, if any, and mark the node as cancelled."""
    fetcher = self._fetch_cached_df
    if fetcher is None:
        logger.warning('No external process to cancel')
    else:
        fetcher.cancel()
    self.node_stats.is_canceled = True
535
+
536
def execute_node(self, run_location: schemas.ExecutionLocationsLiteral, reset_cache: bool = False,
                 performance_mode: bool = False, retry: bool = True, node_logger: NodeLogger = None):
    """Run the node remotely or locally, if it is set up and needs to run.

    :param run_location: requested execution location.
    :param reset_cache: call remove_cache and clear ``has_run`` first.
    :param performance_mode: forwarded to the execution method; forced to
        False when the node caches its results.
    :param retry: on a "No such file or directory" error, rerun all inputs
        with a cache reset and then rerun this node once.
    :param node_logger: required logger.
    :raises Exception: when ``node_logger`` is None. Execution errors are
        stored on the node and logged, not raised.
    """
    # NOTE(review): nesting reconstructed from a flattened diff; the rerun of
    # this node is assumed to follow the loop over the inputs.
    if node_logger is None:
        raise Exception('Flow logger is not defined')
    if reset_cache:
        self.remove_cache()
        self.node_stats.has_run = False

    if not self.is_setup:
        node_logger.warning(f'Node {self.__name__} is not setup, cannot run the node')
        return

    node_logger.info(f'Starting to run {self.__name__}')
    if not self.needs_run(performance_mode, node_logger, run_location):
        node_logger.info('Node has already run, not running the node')
        return

    self.prepare_before_run()
    try:
        run_remotely = (run_location == 'remote'
                        or (self.node_default.transform_type == 'wide' and run_location != 'local')
                        or self.node_settings.cache_results)
        if run_remotely:
            node_logger.info('Running the node remotely')
            if self.node_settings.cache_results:
                # Cached nodes always need the full remote run.
                performance_mode = False
            self.execute_remote(performance_mode=performance_mode, node_logger=node_logger)
        else:
            node_logger.info('Running the node locally')
            self.execute_local(performance_mode=performance_mode, flow_id=node_logger.flow_id)
    except Exception as e:
        if 'No such file or directory (os error' in str(e) and retry:
            logger.warning('Error with the input node, starting to rerun the input node...')
            upstream_nodes: List[FlowNode] = self.node_inputs.get_all_inputs()
            for upstream in upstream_nodes:
                upstream.execute_node(run_location=run_location,
                                      performance_mode=performance_mode, retry=True,
                                      reset_cache=True,
                                      node_logger=node_logger)
            self.execute_node(run_location=run_location,
                              performance_mode=performance_mode, retry=False,
                              node_logger=node_logger)
        else:
            self.node_stats.has_run = False
            self.results.errors = str(e)
            node_logger.error(f'Error with running the node: {e}')
582
+
583
def store_example_data_generator(self, external_df_fetcher: ExternalDfFetcher | ExternalSampler):
    """Register where the example data lives and how to read its first rows.

    Args:
        external_df_fetcher: Fetcher or sampler whose ``status.file_ref``
            points at the produced file. When ``status`` is ``None`` nothing
            is stored and an error is logged.
    """
    if external_df_fetcher.status is None:
        logger.error('Could not get the sample data, the external process is not ready')
        return

    sample_path = external_df_fetcher.status.file_ref
    self.results.example_data_path = sample_path
    # The reader is created with n=100, i.e. it is asked for the top 100 rows.
    self.results.example_data_generator = get_read_top_n(file_path=sample_path, n=100)
590
+
591
def needs_reset(self) -> bool:
    """Tell whether the stored hash no longer matches the current settings.

    Returns:
        ``True`` when ``self._hash`` differs from the hash calculated from
        ``self.setting_input``.
    """
    stored_hash = self._hash
    current_hash = self.calculate_hash(self.setting_input)
    return stored_hash != current_hash
593
+
594
def reset(self, deep: bool = False):
    """Invalidate run state, results and schemas when a reset is required.

    A reset happens when ``needs_reset()`` reports a change or when ``deep``
    is true; otherwise the method does nothing.

    Args:
        deep: Force the reset even if ``needs_reset()`` returns false.
    """
    if not (self.needs_reset() or deep):
        return

    logger.info(f'{self.node_id}: Node needs reset')
    self.node_stats.has_run = False
    self.results.reset()
    if self.schema_callback:
        self.schema_callback.reset()
    self.node_schema.result_schema = self.node_schema.predicted_schema = None
    self._hash = None
    self.node_information.is_setup = None
    self.evaluate_nodes()
607
+
608
def delete_lead_to_node(self, node_id: int) -> bool:
    """Remove the first downstream node with the given id.

    Args:
        node_id: Id of the node to drop from ``self.leads_to_nodes``.

    Returns:
        ``True`` when a matching node was found and removed, else ``False``.
    """
    logger.info(f'Deleting lead to node: {node_id}')
    for position, downstream in enumerate(self.leads_to_nodes):
        logger.info(f'Checking lead to node: {downstream.node_id}')
        if downstream.node_id != node_id:
            continue
        logger.info(f'Found the node to delete: {node_id}')
        # Returning right after the pop keeps the mutation-while-iterating safe.
        self.leads_to_nodes.pop(position)
        return True
    return False
617
+
618
def delete_input_node(self, node_id: int, connection_type: input_schema.InputConnectionClass = 'input-0',
                      complete: bool = False) -> bool:
    """Disconnect the upstream node ``node_id`` from this node's inputs.

    Args:
        node_id: Id of the upstream node to disconnect.
        connection_type: Slot to search: ``'input-0'`` (main inputs),
            ``'input-1'`` (right input) or ``'input-2'`` (left input).
        complete: When true, every slot is searched and every matching
            connection is removed, whatever ``connection_type`` says. When
            false, only the requested slot is searched and at most one main
            input connection is removed.

    Returns:
        ``True`` when at least one connection was removed (``reset()`` has
        then been called), ``False`` otherwise.
    """
    if not complete and connection_type not in ('input-0', 'input-1', 'input-2'):
        logger.warning('Could not find the connection to delete...')
        return False

    inputs = self.node_inputs
    deleted: bool = False

    if complete or connection_type == 'input-0':
        main_inputs = inputs.main_inputs if inputs.main_inputs is not None else []
        # Collect the positions first: popping while enumerating skips items.
        matches = [i for i, node in enumerate(main_inputs) if node.node_id == node_id]
        if not complete:
            matches = matches[:1]
        # Pop from the back so the remaining positions stay valid.
        for position in reversed(matches):
            main_inputs.pop(position)
        if matches:
            deleted = True

    if complete or connection_type == 'input-1':
        if inputs.right_input is not None and inputs.right_input.node_id == node_id:
            inputs.right_input = None
            deleted = True

    if complete or connection_type == 'input-2':
        # Compare against the left input itself (the right input was checked
        # here before, which crashed when no right input was connected).
        if inputs.left_input is not None and inputs.left_input.node_id == node_id:
            inputs.left_input = None
            deleted = True

    if deleted:
        self.reset()
    return deleted
642
+
643
def __repr__(self):
    """Return a short label made of the node id and the node type."""
    identifier, kind = self.node_id, self.node_type
    return f"Node id: {identifier} ({kind})"
645
+
646
def _get_readable_schema(self):
    """Describe the schema as plain dicts of column name and data type.

    Returns:
        A list with one ``{'column_name': ..., 'data_type': ...}`` dict per
        entry of ``self.schema``, or ``None`` when the node is not set up.
    """
    if not self.is_setup:
        return None
    return [dict(column_name=column.column_name, data_type=column.data_type)
            for column in self.schema]
652
+
653
def get_repr(self):
    """Return a nested dict summarising the node.

    Returns:
        ``{'FlowNode': {...}}`` holding the node id, the step name
        (``self.__name__``), the output columns and the readable schema.
    """
    summary = dict(
        node_id=self.node_id,
        step_name=self.__name__,
        output_columns=self.node_schema.output_columns,
        output_schema=self._get_readable_schema(),
    )
    return dict(FlowNode=summary)
659
+
660
@property
def number_of_leads_to_nodes(self) -> int:
    """Number of downstream nodes, or ``None`` when the node is not set up."""
    if not self.is_setup:
        return None
    return len(self.leads_to_nodes)
664
+
665
@property
def has_next_step(self) -> bool:
    """Whether at least one downstream node is connected to this node."""
    downstream_count = len(self.leads_to_nodes)
    return downstream_count > 0
668
+
669
@property
def has_input(self) -> bool:
    """Whether ``self.all_inputs`` contains at least one node."""
    input_count = len(self.all_inputs)
    return input_count > 0
672
+
673
@property
def singular_input(self) -> bool:
    """Whether the node template's ``input`` value equals 1."""
    template_input = self.node_template.input
    return template_input == 1
676
+
677
@property
def singular_main_input(self) -> "FlowNode":
    """First entry of ``self.all_inputs`` for single-input nodes.

    Returns ``None`` when ``self.singular_input`` is false.
    """
    if not self.singular_input:
        return None
    return self.all_inputs[0]
681
+
682
def get_table_example(self, include_data: bool = False) -> TableExample | None:
    """Build a ``TableExample`` describing this node's output.

    Nodes of type ``'output'`` delegate to their first main input. For other
    nodes, example rows are read only when the node is set up and
    ``include_data`` is true; otherwise an example with just the schema (or
    an empty schema when it cannot be determined) is returned.

    Args:
        include_data: Whether example rows should be included.

    Returns:
        The table example for this node (or for its first main input when the
        node type is ``'output'``).
    """
    self.print('Getting a table example')
    if self.node_type == 'output':
        self.print('getting the table example')
        return self.main_input[0].get_table_example(include_data)

    if not (self.is_setup and include_data):
        logger.warning('getting the table example but the node has not run')
        try:
            schema = [FileColumn.model_validate(c.get_column_repr()) for c in self.schema]
        except Exception as exc:
            # A schema that cannot be resolved results in an empty example.
            logger.warning(exc)
            schema = []
        columns = [s.name for s in schema]
        return TableExample(node_id=self.node_id,
                            name=str(self.node_id), number_of_records=0,
                            number_of_columns=len(columns),
                            table_schema=schema, columns=columns,
                            data=[])

    logger.info('getting the table example since the node has run')
    read_example = self.results.example_data_generator
    rows = read_example().to_pylist() if read_example is not None else None
    if rows is None:
        rows = []
    schema = [FileColumn.model_validate(c.get_column_repr()) for c in self.schema]
    engine = self.get_resulting_data()
    # NOTE(review): number_of_records is the fixed value 999 here, not a count.
    return TableExample(node_id=self.node_id,
                        name=str(self.node_id), number_of_records=999,
                        number_of_columns=engine.number_of_fields,
                        table_schema=schema, columns=engine.columns, data=rows)
715
+
716
def calculate_settings_out_select(self):
    """Placeholder: performs no work and returns ``None``."""
718
+
719
def get_node_data(self, flow_id: int, include_example: bool = False) -> NodeData:
    """Assemble the ``NodeData`` for this node.

    Table examples of the connected inputs are attached without example rows;
    the node's own output example is attached when the node is set up. The
    result is then passed through the setting generator and the setting
    updator registered for the node type.

    Args:
        flow_id: Id of the flow, stored on the returned object.
        include_example: Whether the node's own output example should contain
            example rows.

    Returns:
        The node data after the generator and updator have processed it.
    """
    node_data = NodeData(flow_id=flow_id,
                         node_id=self.node_id,
                         has_run=self.node_stats.has_run,
                         setting_input=self.setting_input,
                         flow_type=self.node_type)
    if self.main_input:
        node_data.main_input = self.main_input[0].get_table_example()
    if self.left_input:
        node_data.left_input = self.left_input.get_table_example()
    if self.right_input:
        node_data.right_input = self.right_input.get_table_example()
    if self.is_setup:
        node_data.main_output = self.get_table_example(include_example)

    generate_settings = setting_generator.get_setting_generator(self.node_type)
    node_data = generate_settings(node_data)
    update_settings = setting_updator.get_setting_updator(self.node_type)
    return update_settings(node_data)
737
+
738
def get_output_data(self) -> TableExample:
    """Return this node's table example with example rows included."""
    return self.get_table_example(include_data=True)
740
+
741
def get_node_input(self) -> schemas.NodeInput:
    """Build the ``NodeInput`` for this node.

    Combines the canvas position from ``self.setting_input``, the node id and
    every attribute of ``self.node_template``.
    """
    position = self.setting_input
    return schemas.NodeInput(
        pos_y=position.pos_y,
        pos_x=position.pos_x,
        id=self.node_id,
        **vars(self.node_template),
    )
746
+
747
def get_edge_input(self) -> List[schemas.NodeEdge]:
    """List the edges that connect the input nodes to this node.

    Main inputs target handle ``'input-0'``, the left input ``'input-2'`` and
    the right input ``'input-1'``; every edge starts at the source's
    ``'output-0'`` handle.

    Returns:
        Edges for the main inputs (in order), then the left input, then the
        right input. Slots without a connected node are skipped.
    """
    inputs = self.node_inputs
    edges = []

    if inputs.main_inputs is not None:
        edges.extend(
            schemas.NodeEdge(id=f'{upstream.node_id}-{self.node_id}-{position}',
                             source=upstream.node_id,
                             target=self.node_id,
                             sourceHandle='output-0',
                             targetHandle='input-0',
                             )
            for position, upstream in enumerate(inputs.main_inputs)
        )

    # NOTE(review): the id suffix reads 'right' for the left input and 'left'
    # for the right input; kept unchanged - confirm whether consumers rely on it.
    if inputs.left_input is not None:
        left = inputs.left_input
        edges.append(schemas.NodeEdge(id=f'{left.node_id}-{self.node_id}-right',
                                      source=left.node_id,
                                      target=self.node_id,
                                      sourceHandle='output-0',
                                      targetHandle='input-2',
                                      ))
    if inputs.right_input is not None:
        right = inputs.right_input
        edges.append(schemas.NodeEdge(id=f'{right.node_id}-{self.node_id}-left',
                                      source=right.node_id,
                                      target=self.node_id,
                                      sourceHandle='output-0',
                                      targetHandle='input-1',
                                      ))
    return edges