flowfile-0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
|
@@ -0,0 +1,1403 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import pickle
|
|
3
|
+
import polars as pl
|
|
4
|
+
import fastexcel
|
|
5
|
+
from fastapi.exceptions import HTTPException
|
|
6
|
+
from time import time
|
|
7
|
+
from functools import partial
|
|
8
|
+
from typing import List, Dict, Union, Callable, Any, Optional, Tuple
|
|
9
|
+
from uuid import uuid1
|
|
10
|
+
from copy import deepcopy
|
|
11
|
+
from pyarrow.parquet import ParquetFile
|
|
12
|
+
from flowfile_core.configs import logger
|
|
13
|
+
from flowfile_core.configs.flow_logger import FlowLogger
|
|
14
|
+
from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
|
|
15
|
+
from flowfile_core.flowfile.sources.external_sources.airbyte_sources.settings import airbyte_settings_from_config
|
|
16
|
+
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import type_to_polars_str, FlowfileColumn
|
|
17
|
+
from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.settings_validator import (calculate_fuzzy_match_schema,
|
|
18
|
+
pre_calculate_pivot_schema)
|
|
19
|
+
from flowfile_core.utils.arrow_reader import get_read_top_n
|
|
20
|
+
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine, execute_polars_code
|
|
21
|
+
from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_xlsx_datatypes, \
|
|
22
|
+
get_calamine_xlsx_data_types
|
|
23
|
+
from flowfile_core.flowfile.sources import external_sources
|
|
24
|
+
from flowfile_core.schemas import input_schema, schemas, transform_schema
|
|
25
|
+
from flowfile_core.schemas.output_model import TableExample, NodeData, NodeResult, RunInformation
|
|
26
|
+
from flowfile_core.flowfile.utils import snake_case_to_camel_case
|
|
27
|
+
from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
|
|
28
|
+
from flowfile_core.flowfile.flow_node.flow_node import FlowNode
|
|
29
|
+
from flowfile_core.flowfile.util.execution_orderer import determine_execution_order
|
|
30
|
+
from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
|
|
31
|
+
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (ExternalAirbyteFetcher,
|
|
32
|
+
ExternalDatabaseFetcher,
|
|
33
|
+
ExternalDatabaseWriter,
|
|
34
|
+
ExternalDfFetcher)
|
|
35
|
+
from flowfile_core.secrets.secrets import get_encrypted_secret, decrypt_secret
|
|
36
|
+
from flowfile_core.flowfile.sources.external_sources.sql_source import utils as sql_utils, models as sql_models
|
|
37
|
+
from flowfile_core.flowfile.sources.external_sources.sql_source.sql_source import SqlSource, BaseSqlSource
|
|
38
|
+
from flowfile_core.flowfile.database_connection_manager.db_connections import get_local_database_connection
|
|
39
|
+
from flowfile_core.flowfile.util.calculate_layout import calculate_layered_layout
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_xlsx_schema(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
|
|
43
|
+
end_row: int, end_column: int, has_headers: bool):
|
|
44
|
+
try:
|
|
45
|
+
logger.info('Starting to calculate the schema')
|
|
46
|
+
if engine == 'openpyxl':
|
|
47
|
+
max_col = end_column if end_column > 0 else None
|
|
48
|
+
return get_open_xlsx_datatypes(file_path=file_path,
|
|
49
|
+
sheet_name=sheet_name,
|
|
50
|
+
min_row=start_row + 1,
|
|
51
|
+
min_col=start_column + 1,
|
|
52
|
+
max_row=100,
|
|
53
|
+
max_col=max_col, has_headers=has_headers)
|
|
54
|
+
elif engine == 'calamine':
|
|
55
|
+
return get_calamine_xlsx_data_types(file_path=file_path,
|
|
56
|
+
sheet_name=sheet_name,
|
|
57
|
+
start_row=start_row,
|
|
58
|
+
end_row=end_row)
|
|
59
|
+
logger.info('done calculating the schema')
|
|
60
|
+
except Exception as e:
|
|
61
|
+
logger.error(e)
|
|
62
|
+
return []
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def skip_node_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
|
|
66
|
+
if len(nodes) > 0:
|
|
67
|
+
msg = "\n".join(str(node) for node in nodes)
|
|
68
|
+
flow_logger.warning(f'skipping nodes:\n{msg}')
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def execution_order_message(flow_logger: FlowLogger, nodes: List[FlowNode]) -> None:
|
|
72
|
+
msg = "\n".join(str(node) for node in nodes)
|
|
73
|
+
flow_logger.info(f'execution order:\n{msg}')
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_xlsx_schema_callback(engine: str, file_path: str, sheet_name: str, start_row: int, start_column: int,
|
|
77
|
+
end_row: int, end_column: int, has_headers: bool):
|
|
78
|
+
return partial(get_xlsx_schema, engine=engine, file_path=file_path, sheet_name=sheet_name, start_row=start_row,
|
|
79
|
+
start_column=start_column, end_row=end_row, end_column=end_column, has_headers=has_headers)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class FlowGraph:
|
|
83
|
+
"""
|
|
84
|
+
FlowGraph is a class that enables Extract, Transform and Load (ETL) operations
|
|
85
|
+
on data. It allows you to create a Directed Acyclic Graph (DAG) where each
|
|
86
|
+
node represents a step in the ETL pipeline.
|
|
87
|
+
|
|
88
|
+
The class offers methods to add transformations and data sources, as well as
|
|
89
|
+
methods to run the transformations and generate results.
|
|
90
|
+
|
|
91
|
+
Attributes:
|
|
92
|
+
_input_cols (set): A set that stores the input columns for the transformations.
|
|
93
|
+
_output_cols (set): A set that stores the output columns from the transformations.
|
|
94
|
+
"""
|
|
95
|
+
uuid: str
|
|
96
|
+
depends_on: Dict[int, Union[ParquetFile, FlowDataEngine, "FlowGraph", pl.DataFrame,]]
|
|
97
|
+
_flow_id: int
|
|
98
|
+
_input_data: Union[ParquetFile, FlowDataEngine, "FlowGraph"]
|
|
99
|
+
_input_cols: List[str]
|
|
100
|
+
_output_cols: List[str]
|
|
101
|
+
_node_db: Dict[Union[str, int], FlowNode]
|
|
102
|
+
_node_ids: List[Union[str, int]]
|
|
103
|
+
_results: Optional[FlowDataEngine] = None
|
|
104
|
+
cache_results: bool = False
|
|
105
|
+
schema: Optional[List[FlowfileColumn]] = None
|
|
106
|
+
has_over_row_function: bool = False
|
|
107
|
+
_flow_starts: List[Union[int, str]] = None
|
|
108
|
+
node_results: List[NodeResult] = None
|
|
109
|
+
latest_run_info: Optional[RunInformation] = None
|
|
110
|
+
start_datetime: datetime = None
|
|
111
|
+
end_datetime: datetime = None
|
|
112
|
+
nodes_completed: int = 0
|
|
113
|
+
flow_settings: schemas.FlowSettings = None
|
|
114
|
+
flow_logger: FlowLogger
|
|
115
|
+
|
|
116
|
+
def __init__(self, flow_id: int,
|
|
117
|
+
flow_settings: schemas.FlowSettings,
|
|
118
|
+
name: str = None, input_cols: List[str] = None,
|
|
119
|
+
output_cols: List[str] = None,
|
|
120
|
+
path_ref: str = None,
|
|
121
|
+
input_flow: Union[ParquetFile, FlowDataEngine, "FlowGraph"] = None,
|
|
122
|
+
cache_results: bool = False):
|
|
123
|
+
self.flow_settings = flow_settings
|
|
124
|
+
self.uuid = str(uuid1())
|
|
125
|
+
self.nodes_completed = 0
|
|
126
|
+
self.start_datetime = None
|
|
127
|
+
self.end_datetime = None
|
|
128
|
+
self.latest_run_info = None
|
|
129
|
+
self.node_results = []
|
|
130
|
+
self._flow_id = flow_id
|
|
131
|
+
self.flow_logger = FlowLogger(flow_id)
|
|
132
|
+
self._flow_starts: List[FlowNode] = []
|
|
133
|
+
self._results = None
|
|
134
|
+
self.schema = None
|
|
135
|
+
self.has_over_row_function = False
|
|
136
|
+
self._input_cols = [] if input_cols is None else input_cols
|
|
137
|
+
self._output_cols = [] if output_cols is None else output_cols
|
|
138
|
+
self._node_ids = []
|
|
139
|
+
self._node_db = {}
|
|
140
|
+
self.cache_results = cache_results
|
|
141
|
+
self.__name__ = name if name else id(self)
|
|
142
|
+
self.depends_on = {}
|
|
143
|
+
if path_ref is not None:
|
|
144
|
+
self.add_datasource(input_schema.NodeDatasource(file_path=path_ref))
|
|
145
|
+
elif input_flow is not None:
|
|
146
|
+
self.add_datasource(input_file=input_flow)
|
|
147
|
+
|
|
148
|
+
def add_node_promise(self, node_promise: input_schema.NodePromise):
|
|
149
|
+
|
|
150
|
+
def placeholder(n: FlowNode = None):
|
|
151
|
+
if n is None:
|
|
152
|
+
return FlowDataEngine()
|
|
153
|
+
return n
|
|
154
|
+
|
|
155
|
+
self.add_node_step(node_id=node_promise.node_id, node_type=node_promise.node_type, function=placeholder,
|
|
156
|
+
setting_input=node_promise)
|
|
157
|
+
|
|
158
|
+
def apply_layout(self, y_spacing: int = 150, x_spacing: int = 200, initial_y: int = 100):
|
|
159
|
+
"""
|
|
160
|
+
Calculates and applies a layered layout to all nodes in the graph.
|
|
161
|
+
Updates the pos_x and pos_y attributes of the node setting inputs.
|
|
162
|
+
"""
|
|
163
|
+
self.flow_logger.info("Applying layered layout...")
|
|
164
|
+
start_time = time()
|
|
165
|
+
try:
|
|
166
|
+
# Calculate new positions for all nodes
|
|
167
|
+
new_positions = calculate_layered_layout(
|
|
168
|
+
self, y_spacing=y_spacing, x_spacing=x_spacing, initial_y=initial_y
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
if not new_positions:
|
|
172
|
+
self.flow_logger.warning("Layout calculation returned no positions.")
|
|
173
|
+
return
|
|
174
|
+
|
|
175
|
+
# Apply the new positions to the setting_input of each node
|
|
176
|
+
updated_count = 0
|
|
177
|
+
for node_id, (pos_x, pos_y) in new_positions.items():
|
|
178
|
+
node = self.get_node(node_id)
|
|
179
|
+
if node and hasattr(node, 'setting_input'):
|
|
180
|
+
setting = node.setting_input
|
|
181
|
+
if hasattr(setting, 'pos_x') and hasattr(setting, 'pos_y'):
|
|
182
|
+
setting.pos_x = pos_x
|
|
183
|
+
setting.pos_y = pos_y
|
|
184
|
+
updated_count += 1
|
|
185
|
+
else:
|
|
186
|
+
self.flow_logger.warning(f"Node {node_id} setting_input ({type(setting)}) lacks pos_x/pos_y attributes.")
|
|
187
|
+
elif node:
|
|
188
|
+
self.flow_logger.warning(f"Node {node_id} lacks setting_input attribute.")
|
|
189
|
+
# else: Node not found, already warned by calculate_layered_layout
|
|
190
|
+
|
|
191
|
+
end_time = time()
|
|
192
|
+
self.flow_logger.info(f"Layout applied to {updated_count}/{len(self.nodes)} nodes in {end_time - start_time:.2f} seconds.")
|
|
193
|
+
|
|
194
|
+
except Exception as e:
|
|
195
|
+
self.flow_logger.error(f"Error applying layout: {e}")
|
|
196
|
+
raise # Optional: re-raise the exception
|
|
197
|
+
|
|
198
|
+
def add_initial_node_analysis(self, node_promise: input_schema.NodePromise):
|
|
199
|
+
node_analysis = create_graphic_walker_node_from_node_promise(node_promise)
|
|
200
|
+
self.add_explore_data(node_analysis)
|
|
201
|
+
|
|
202
|
+
def add_explore_data(self, node_analysis: input_schema.NodeExploreData):
|
|
203
|
+
sample_size: int = 10000
|
|
204
|
+
|
|
205
|
+
def analysis_preparation(flowfile_table: FlowDataEngine):
|
|
206
|
+
|
|
207
|
+
if flowfile_table.number_of_records<0:
|
|
208
|
+
|
|
209
|
+
number_of_records = ExternalDfFetcher(
|
|
210
|
+
lf=flowfile_table.data_frame,
|
|
211
|
+
operation_type="calculate_number_of_records",
|
|
212
|
+
flow_id=self.flow_id,
|
|
213
|
+
node_id=node.node_id,
|
|
214
|
+
).result
|
|
215
|
+
else:
|
|
216
|
+
number_of_records = flowfile_table.number_of_records
|
|
217
|
+
if number_of_records > sample_size:
|
|
218
|
+
flowfile_table = flowfile_table.get_sample(sample_size, random=True)
|
|
219
|
+
|
|
220
|
+
external_sampler = ExternalDfFetcher(
|
|
221
|
+
lf=flowfile_table.data_frame,
|
|
222
|
+
file_ref=node.hash,
|
|
223
|
+
wait_on_completion=True,
|
|
224
|
+
node_id=node.node_id,
|
|
225
|
+
flow_id=self.flow_id,
|
|
226
|
+
)
|
|
227
|
+
node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref, 10000)
|
|
228
|
+
return flowfile_table
|
|
229
|
+
|
|
230
|
+
def schema_callback():
|
|
231
|
+
node = self.get_node(node_analysis.node_id)
|
|
232
|
+
if len(node.all_inputs) == 1:
|
|
233
|
+
input_node = node.all_inputs[0]
|
|
234
|
+
return input_node.schema
|
|
235
|
+
else:
|
|
236
|
+
return [FlowfileColumn.from_input('col_1', 'na')]
|
|
237
|
+
|
|
238
|
+
self.add_node_step(node_id=node_analysis.node_id, node_type='explore_data',
|
|
239
|
+
function=analysis_preparation,
|
|
240
|
+
setting_input=node_analysis, schema_callback=schema_callback)
|
|
241
|
+
node = self.get_node(node_analysis.node_id)
|
|
242
|
+
|
|
243
|
+
@property
|
|
244
|
+
def flow_id(self) -> int:
|
|
245
|
+
return self._flow_id
|
|
246
|
+
|
|
247
|
+
@flow_id.setter
|
|
248
|
+
def flow_id(self, new_id: int):
|
|
249
|
+
self._flow_id = new_id
|
|
250
|
+
for node in self.nodes:
|
|
251
|
+
if hasattr(node.setting_input, 'flow_id'):
|
|
252
|
+
node.setting_input.flow_id = new_id
|
|
253
|
+
self.flow_settings.flow_id = new_id
|
|
254
|
+
|
|
255
|
+
def __repr__(self):
|
|
256
|
+
"""
|
|
257
|
+
Official string representation of the FlowGraph class.
|
|
258
|
+
"""
|
|
259
|
+
settings_str = " -" + '\n -'.join(f"{k}: {v}" for k, v in self.flow_settings)
|
|
260
|
+
return f"FlowGraph(\nNodes: {self._node_db}\n\nSettings:\n{settings_str}"
|
|
261
|
+
|
|
262
|
+
def get_nodes_overview(self):
|
|
263
|
+
output = []
|
|
264
|
+
for v in self._node_db.values():
|
|
265
|
+
output.append(v.get_repr())
|
|
266
|
+
return output
|
|
267
|
+
|
|
268
|
+
def remove_from_output_cols(self, columns: List[str]):
|
|
269
|
+
cols = set(columns)
|
|
270
|
+
self._output_cols = [c for c in self._output_cols if c not in cols]
|
|
271
|
+
|
|
272
|
+
def get_node(self, node_id: Union[int, str] = None) -> FlowNode:
|
|
273
|
+
if node_id is None:
|
|
274
|
+
node_id = self._node_ids[-1]
|
|
275
|
+
node = self._node_db.get(node_id)
|
|
276
|
+
if node is not None:
|
|
277
|
+
return node
|
|
278
|
+
|
|
279
|
+
def add_pivot(self, pivot_settings: input_schema.NodePivot):
|
|
280
|
+
def _func(fl: FlowDataEngine):
|
|
281
|
+
return fl.do_pivot(pivot_settings.pivot_input, self.flow_logger.get_node_logger(pivot_settings.node_id))
|
|
282
|
+
|
|
283
|
+
self.add_node_step(node_id=pivot_settings.node_id,
|
|
284
|
+
function=_func,
|
|
285
|
+
node_type='pivot',
|
|
286
|
+
setting_input=pivot_settings,
|
|
287
|
+
input_node_ids=[pivot_settings.depending_on_id])
|
|
288
|
+
|
|
289
|
+
node = self.get_node(pivot_settings.node_id)
|
|
290
|
+
|
|
291
|
+
def schema_callback():
|
|
292
|
+
input_data = node.singular_main_input.get_resulting_data() # get from the previous step the data
|
|
293
|
+
input_data.lazy = True # ensure the dataset is lazy
|
|
294
|
+
input_lf = input_data.data_frame # get the lazy frame
|
|
295
|
+
return pre_calculate_pivot_schema(input_data.schema, pivot_settings.pivot_input, input_lf=input_lf)
|
|
296
|
+
node.schema_callback = schema_callback
|
|
297
|
+
|
|
298
|
+
def add_unpivot(self, unpivot_settings: input_schema.NodeUnpivot):
|
|
299
|
+
|
|
300
|
+
def _func(fl: FlowDataEngine) -> FlowDataEngine:
|
|
301
|
+
return fl.unpivot(unpivot_settings.unpivot_input)
|
|
302
|
+
|
|
303
|
+
self.add_node_step(node_id=unpivot_settings.node_id,
|
|
304
|
+
function=_func,
|
|
305
|
+
node_type='unpivot',
|
|
306
|
+
setting_input=unpivot_settings,
|
|
307
|
+
input_node_ids=[unpivot_settings.depending_on_id])
|
|
308
|
+
|
|
309
|
+
def add_union(self, union_settings: input_schema.NodeUnion):
|
|
310
|
+
def _func(*flowfile_tables: FlowDataEngine):
|
|
311
|
+
dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [flt.data_frame for flt in flowfile_tables]
|
|
312
|
+
return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))
|
|
313
|
+
|
|
314
|
+
self.add_node_step(node_id=union_settings.node_id,
|
|
315
|
+
function=_func,
|
|
316
|
+
node_type=f'union',
|
|
317
|
+
setting_input=union_settings,
|
|
318
|
+
input_node_ids=union_settings.depending_on_ids)
|
|
319
|
+
|
|
320
|
+
def add_group_by(self, group_by_settings: input_schema.NodeGroupBy):
|
|
321
|
+
|
|
322
|
+
def _func(fl: FlowDataEngine) -> FlowDataEngine:
|
|
323
|
+
return fl.do_group_by(group_by_settings.groupby_input, False)
|
|
324
|
+
|
|
325
|
+
self.add_node_step(node_id=group_by_settings.node_id,
|
|
326
|
+
function=_func,
|
|
327
|
+
node_type=f'group_by',
|
|
328
|
+
setting_input=group_by_settings,
|
|
329
|
+
input_node_ids=[group_by_settings.depending_on_id])
|
|
330
|
+
|
|
331
|
+
node = self.get_node(group_by_settings.node_id)
|
|
332
|
+
|
|
333
|
+
def schema_callback():
|
|
334
|
+
output_columns = [(c.old_name, c.new_name, c.output_type) for c in group_by_settings.groupby_input.agg_cols]
|
|
335
|
+
depends_on = node.node_inputs.main_inputs[0]
|
|
336
|
+
input_schema_dict: Dict[str, str] = {s.name: s.data_type for s in depends_on.schema}
|
|
337
|
+
output_schema = []
|
|
338
|
+
for old_name, new_name, data_type in output_columns:
|
|
339
|
+
data_type = input_schema_dict[old_name] if data_type is None else data_type
|
|
340
|
+
output_schema.append(FlowfileColumn.from_input(data_type=data_type, column_name=new_name))
|
|
341
|
+
return output_schema
|
|
342
|
+
|
|
343
|
+
node.schema_callback = schema_callback
|
|
344
|
+
|
|
345
|
+
def add_or_update_column_func(self, col_name: str, pl_dtype: pl.DataType, depends_on: FlowNode):
|
|
346
|
+
col_output = FlowfileColumn.from_input(column_name=col_name, data_type=str(pl_dtype))
|
|
347
|
+
schema = depends_on.schema
|
|
348
|
+
col_exist = depends_on.get_flow_file_column_schema(col_name)
|
|
349
|
+
if col_exist is None:
|
|
350
|
+
new_schema = schema + [col_output]
|
|
351
|
+
else:
|
|
352
|
+
new_schema = []
|
|
353
|
+
for s in self.schema:
|
|
354
|
+
if s.name == col_name:
|
|
355
|
+
new_schema.append(col_output)
|
|
356
|
+
else:
|
|
357
|
+
new_schema.append(s)
|
|
358
|
+
return new_schema
|
|
359
|
+
|
|
360
|
+
def add_filter(self, filter_settings: input_schema.NodeFilter):
|
|
361
|
+
is_advanced = filter_settings.filter_input.filter_type == 'advanced'
|
|
362
|
+
if is_advanced:
|
|
363
|
+
predicate = filter_settings.filter_input.advanced_filter
|
|
364
|
+
else:
|
|
365
|
+
_basic_filter = filter_settings.filter_input.basic_filter
|
|
366
|
+
filter_settings.filter_input.advanced_filter = (f'[{_basic_filter.field}]{_basic_filter.filter_type}"'
|
|
367
|
+
f'{_basic_filter.filter_value}"')
|
|
368
|
+
|
|
369
|
+
def _func(fl: FlowDataEngine):
|
|
370
|
+
is_advanced = filter_settings.filter_input.filter_type == 'advanced'
|
|
371
|
+
if is_advanced:
|
|
372
|
+
return fl.do_filter(predicate)
|
|
373
|
+
else:
|
|
374
|
+
basic_filter = filter_settings.filter_input.basic_filter
|
|
375
|
+
if basic_filter.filter_value.isnumeric():
|
|
376
|
+
field_data_type = fl.get_schema_column(basic_filter.field).generic_datatype()
|
|
377
|
+
if field_data_type == 'str':
|
|
378
|
+
_f = f'[{basic_filter.field}]{basic_filter.filter_type}"{basic_filter.filter_value}"'
|
|
379
|
+
else:
|
|
380
|
+
_f = f'[{basic_filter.field}]{basic_filter.filter_type}{basic_filter.filter_value}'
|
|
381
|
+
else:
|
|
382
|
+
_f = f'[{basic_filter.field}]{basic_filter.filter_type}"{basic_filter.filter_value}"'
|
|
383
|
+
filter_settings.filter_input.advanced_filter = _f
|
|
384
|
+
return fl.do_filter(_f)
|
|
385
|
+
|
|
386
|
+
self.add_node_step(filter_settings.node_id, _func,
|
|
387
|
+
node_type='filter',
|
|
388
|
+
renew_schema=False,
|
|
389
|
+
setting_input=filter_settings,
|
|
390
|
+
input_node_ids=[filter_settings.depending_on_id]
|
|
391
|
+
)
|
|
392
|
+
|
|
393
|
+
def add_record_count(self, node_number_of_records: input_schema.NodeRecordCount):
|
|
394
|
+
def _func(fl: FlowDataEngine) -> FlowDataEngine:
|
|
395
|
+
return fl.get_record_count()
|
|
396
|
+
|
|
397
|
+
self.add_node_step(node_id=node_number_of_records.node_id,
|
|
398
|
+
function=_func,
|
|
399
|
+
node_type='record_count',
|
|
400
|
+
setting_input=node_number_of_records,
|
|
401
|
+
input_node_ids=[node_number_of_records.depending_on_id])
|
|
402
|
+
|
|
403
|
+
def add_polars_code(self, node_polars_code: input_schema.NodePolarsCode):
|
|
404
|
+
def _func(*flowfile_tables: FlowDataEngine) -> FlowDataEngine:
|
|
405
|
+
return execute_polars_code(*flowfile_tables, code=node_polars_code.polars_code_input.polars_code)
|
|
406
|
+
|
|
407
|
+
self.add_node_step(node_id=node_polars_code.node_id,
|
|
408
|
+
function=_func,
|
|
409
|
+
node_type='polars_code',
|
|
410
|
+
setting_input=node_polars_code,
|
|
411
|
+
input_node_ids=node_polars_code.depending_on_ids)
|
|
412
|
+
|
|
413
|
+
try:
|
|
414
|
+
polars_code_parser.validate_code(node_polars_code.polars_code_input.polars_code)
|
|
415
|
+
except Exception as e:
|
|
416
|
+
node = self.get_node(node_id=node_polars_code.node_id)
|
|
417
|
+
node.results.errors = str(e)
|
|
418
|
+
|
|
419
|
+
def add_unique(self, unique_settings: input_schema.NodeUnique):
|
|
420
|
+
|
|
421
|
+
def _func(fl: FlowDataEngine) -> FlowDataEngine:
|
|
422
|
+
return fl.make_unique(unique_settings.unique_input)
|
|
423
|
+
|
|
424
|
+
self.add_node_step(node_id=unique_settings.node_id,
|
|
425
|
+
function=_func,
|
|
426
|
+
input_columns=[],
|
|
427
|
+
node_type='unique',
|
|
428
|
+
setting_input=unique_settings,
|
|
429
|
+
input_node_ids=[unique_settings.node_id])
|
|
430
|
+
|
|
431
|
+
def add_graph_solver(self, graph_solver_settings: input_schema.NodeGraphSolver):
|
|
432
|
+
def _func(fl: FlowDataEngine) -> FlowDataEngine:
|
|
433
|
+
return fl.solve_graph(graph_solver_settings.graph_solver_input)
|
|
434
|
+
|
|
435
|
+
self.add_node_step(node_id=graph_solver_settings.node_id,
|
|
436
|
+
function=_func,
|
|
437
|
+
node_type='graph_solver',
|
|
438
|
+
setting_input=graph_solver_settings)
|
|
439
|
+
|
|
440
|
+
def add_formula(self, function_settings: input_schema.NodeFormula):
|
|
441
|
+
error = ""
|
|
442
|
+
if function_settings.function.field.data_type is not None:
|
|
443
|
+
output_type = type_to_polars_str(function_settings.function.field.data_type)
|
|
444
|
+
else:
|
|
445
|
+
output_type = None
|
|
446
|
+
if output_type is not None:
|
|
447
|
+
new_col = [FlowfileColumn.from_input(column_name=function_settings.function.field.name,
|
|
448
|
+
data_type=str(output_type))]
|
|
449
|
+
else:
|
|
450
|
+
new_col = [FlowfileColumn.from_input(function_settings.function.field.name, 'String')]
|
|
451
|
+
|
|
452
|
+
def _func(fl: FlowDataEngine):
|
|
453
|
+
return fl.apply_sql_formula(func=function_settings.function.function,
|
|
454
|
+
col_name=function_settings.function.field.name,
|
|
455
|
+
output_data_type=output_type)
|
|
456
|
+
|
|
457
|
+
self.add_node_step(function_settings.node_id, _func,
|
|
458
|
+
output_schema=new_col,
|
|
459
|
+
node_type='formula',
|
|
460
|
+
renew_schema=False,
|
|
461
|
+
setting_input=function_settings,
|
|
462
|
+
input_node_ids=[function_settings.depending_on_id]
|
|
463
|
+
)
|
|
464
|
+
if error != "":
|
|
465
|
+
node = self.get_node(function_settings.node_id)
|
|
466
|
+
node.results.errors = error
|
|
467
|
+
return False, error
|
|
468
|
+
else:
|
|
469
|
+
return True, ""
|
|
470
|
+
|
|
471
|
+
def add_cross_join(self, cross_join_settings: input_schema.NodeCrossJoin) -> "FlowGraph":
|
|
472
|
+
|
|
473
|
+
def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
|
|
474
|
+
for left_select in cross_join_settings.cross_join_input.left_select.renames:
|
|
475
|
+
left_select.is_available = True if left_select.old_name in main.schema else False
|
|
476
|
+
for right_select in cross_join_settings.cross_join_input.right_select.renames:
|
|
477
|
+
right_select.is_available = True if right_select.old_name in right.schema else False
|
|
478
|
+
|
|
479
|
+
return main.do_cross_join(cross_join_input=cross_join_settings.cross_join_input,
|
|
480
|
+
auto_generate_selection=cross_join_settings.auto_generate_selection,
|
|
481
|
+
verify_integrity=False,
|
|
482
|
+
other=right)
|
|
483
|
+
|
|
484
|
+
self.add_node_step(node_id=cross_join_settings.node_id,
|
|
485
|
+
function=_func,
|
|
486
|
+
input_columns=[],
|
|
487
|
+
node_type='cross_join',
|
|
488
|
+
setting_input=cross_join_settings)
|
|
489
|
+
return self
|
|
490
|
+
|
|
491
|
+
def add_join(self, join_settings: input_schema.NodeJoin) -> "FlowGraph":
|
|
492
|
+
def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
|
|
493
|
+
for left_select in join_settings.join_input.left_select.renames:
|
|
494
|
+
left_select.is_available = True if left_select.old_name in main.schema else False
|
|
495
|
+
for right_select in join_settings.join_input.right_select.renames:
|
|
496
|
+
right_select.is_available = True if right_select.old_name in right.schema else False
|
|
497
|
+
|
|
498
|
+
return main.join(join_input=join_settings.join_input,
|
|
499
|
+
auto_generate_selection=join_settings.auto_generate_selection,
|
|
500
|
+
verify_integrity=False,
|
|
501
|
+
other=right)
|
|
502
|
+
|
|
503
|
+
self.add_node_step(node_id=join_settings.node_id,
|
|
504
|
+
function=_func,
|
|
505
|
+
input_columns=[],
|
|
506
|
+
node_type='join',
|
|
507
|
+
setting_input=join_settings,
|
|
508
|
+
input_node_ids=join_settings.depending_on_ids)
|
|
509
|
+
return self
|
|
510
|
+
|
|
511
|
+
def add_fuzzy_match(self, fuzzy_settings: input_schema.NodeFuzzyMatch) -> "FlowGraph":
|
|
512
|
+
def _func(main: FlowDataEngine, right: FlowDataEngine) -> FlowDataEngine:
|
|
513
|
+
f = main.start_fuzzy_join(fuzzy_match_input=fuzzy_settings.join_input, other=right, file_ref=node.hash,
|
|
514
|
+
flow_id=self.flow_id, node_id=fuzzy_settings.node_id)
|
|
515
|
+
logger.info("Started the fuzzy match action")
|
|
516
|
+
node._fetch_cached_df = f
|
|
517
|
+
return FlowDataEngine(f.get_result())
|
|
518
|
+
|
|
519
|
+
self.add_node_step(node_id=fuzzy_settings.node_id,
|
|
520
|
+
function=_func,
|
|
521
|
+
input_columns=[],
|
|
522
|
+
node_type='fuzzy_match',
|
|
523
|
+
setting_input=fuzzy_settings)
|
|
524
|
+
node = self.get_node(node_id=fuzzy_settings.node_id)
|
|
525
|
+
|
|
526
|
+
def schema_callback():
|
|
527
|
+
return calculate_fuzzy_match_schema(fuzzy_settings.join_input,
|
|
528
|
+
left_schema=node.node_inputs.main_inputs[0].schema,
|
|
529
|
+
right_schema=node.node_inputs.right_input.schema
|
|
530
|
+
)
|
|
531
|
+
|
|
532
|
+
node.schema_callback = schema_callback
|
|
533
|
+
return self
|
|
534
|
+
|
|
535
|
+
def add_text_to_rows(self, node_text_to_rows: input_schema.NodeTextToRows) -> "FlowGraph":
|
|
536
|
+
def _func(table: FlowDataEngine) -> FlowDataEngine:
|
|
537
|
+
return table.split(node_text_to_rows.text_to_rows_input)
|
|
538
|
+
|
|
539
|
+
self.add_node_step(node_id=node_text_to_rows.node_id,
|
|
540
|
+
function=_func,
|
|
541
|
+
node_type='text_to_rows',
|
|
542
|
+
setting_input=node_text_to_rows,
|
|
543
|
+
input_node_ids=[node_text_to_rows.depending_on_id])
|
|
544
|
+
return self
|
|
545
|
+
|
|
546
|
+
def add_sort(self, sort_settings: input_schema.NodeSort) -> "FlowGraph":
|
|
547
|
+
def _func(table: FlowDataEngine) -> FlowDataEngine:
|
|
548
|
+
return table.do_sort(sort_settings.sort_input)
|
|
549
|
+
|
|
550
|
+
self.add_node_step(node_id=sort_settings.node_id,
|
|
551
|
+
function=_func,
|
|
552
|
+
node_type='sort',
|
|
553
|
+
setting_input=sort_settings,
|
|
554
|
+
input_node_ids=[sort_settings.depending_on_id])
|
|
555
|
+
return self
|
|
556
|
+
|
|
557
|
+
def add_sample(self, sample_settings: input_schema.NodeSample) -> "FlowGraph":
|
|
558
|
+
def _func(table: FlowDataEngine) -> FlowDataEngine:
|
|
559
|
+
return table.get_sample(sample_settings.sample_size)
|
|
560
|
+
|
|
561
|
+
self.add_node_step(node_id=sample_settings.node_id,
|
|
562
|
+
function=_func,
|
|
563
|
+
node_type='sample',
|
|
564
|
+
setting_input=sample_settings,
|
|
565
|
+
input_node_ids=[sample_settings.depending_on_id]
|
|
566
|
+
)
|
|
567
|
+
return self
|
|
568
|
+
|
|
569
|
+
def add_record_id(self, record_id_settings: input_schema.NodeRecordId) -> "FlowGraph":
|
|
570
|
+
|
|
571
|
+
def _func(table: FlowDataEngine) -> FlowDataEngine:
|
|
572
|
+
return table.add_record_id(record_id_settings.record_id_input)
|
|
573
|
+
|
|
574
|
+
self.add_node_step(node_id=record_id_settings.node_id,
|
|
575
|
+
function=_func,
|
|
576
|
+
node_type='record_id',
|
|
577
|
+
setting_input=record_id_settings,
|
|
578
|
+
input_node_ids=[record_id_settings.depending_on_id]
|
|
579
|
+
)
|
|
580
|
+
return self
|
|
581
|
+
|
|
582
|
+
def add_select(self, select_settings: input_schema.NodeSelect) -> "FlowGraph":
|
|
583
|
+
select_cols = select_settings.select_input
|
|
584
|
+
drop_cols = tuple(s.old_name for s in select_settings.select_input)
|
|
585
|
+
|
|
586
|
+
def _func(table: FlowDataEngine) -> FlowDataEngine:
|
|
587
|
+
input_cols = set(f.name for f in table.schema)
|
|
588
|
+
ids_to_remove = []
|
|
589
|
+
for i, select_col in enumerate(select_cols):
|
|
590
|
+
if select_col.old_name not in input_cols:
|
|
591
|
+
select_col.is_available = False
|
|
592
|
+
if not select_col.keep:
|
|
593
|
+
ids_to_remove.append(i)
|
|
594
|
+
else:
|
|
595
|
+
select_col.is_available = True
|
|
596
|
+
ids_to_remove.reverse()
|
|
597
|
+
for i in ids_to_remove:
|
|
598
|
+
v = select_cols.pop(i)
|
|
599
|
+
del v
|
|
600
|
+
return table.do_select(select_inputs=transform_schema.SelectInputs(select_cols),
|
|
601
|
+
keep_missing=select_settings.keep_missing)
|
|
602
|
+
|
|
603
|
+
self.add_node_step(node_id=select_settings.node_id,
|
|
604
|
+
function=_func,
|
|
605
|
+
input_columns=[],
|
|
606
|
+
node_type='select',
|
|
607
|
+
drop_columns=list(drop_cols),
|
|
608
|
+
setting_input=select_settings,
|
|
609
|
+
input_node_ids=[select_settings.depending_on_id])
|
|
610
|
+
return self
|
|
611
|
+
|
|
612
|
+
@property
|
|
613
|
+
def graph_has_functions(self) -> bool:
|
|
614
|
+
return len(self._node_ids) > 0
|
|
615
|
+
|
|
616
|
+
def delete_node(self, node_id: Union[int, str]):
|
|
617
|
+
logger.info(f"Starting deletion of node with ID: {node_id}")
|
|
618
|
+
|
|
619
|
+
node = self._node_db.get(node_id)
|
|
620
|
+
if node:
|
|
621
|
+
logger.info(f"Found node: {node_id}, processing deletion")
|
|
622
|
+
|
|
623
|
+
lead_to_steps: List[FlowNode] = node.leads_to_nodes
|
|
624
|
+
logger.debug(f"Node {node_id} leads to {len(lead_to_steps)} other nodes")
|
|
625
|
+
|
|
626
|
+
if len(lead_to_steps) > 0:
|
|
627
|
+
for lead_to_step in lead_to_steps:
|
|
628
|
+
logger.debug(f"Deleting input node {node_id} from dependent node {lead_to_step}")
|
|
629
|
+
lead_to_step.delete_input_node(node_id, complete=True)
|
|
630
|
+
|
|
631
|
+
if not node.is_start:
|
|
632
|
+
depends_on: List[FlowNode] = node.node_inputs.get_all_inputs()
|
|
633
|
+
logger.debug(f"Node {node_id} depends on {len(depends_on)} other nodes")
|
|
634
|
+
|
|
635
|
+
for depend_on in depends_on:
|
|
636
|
+
logger.debug(f"Removing lead_to reference {node_id} from node {depend_on}")
|
|
637
|
+
depend_on.delete_lead_to_node(node_id)
|
|
638
|
+
|
|
639
|
+
self._node_db.pop(node_id)
|
|
640
|
+
logger.debug(f"Successfully removed node {node_id} from node_db")
|
|
641
|
+
del node
|
|
642
|
+
logger.info("Node object deleted")
|
|
643
|
+
else:
|
|
644
|
+
logger.error(f"Failed to find node with id {node_id}")
|
|
645
|
+
raise Exception(f"Node with id {node_id} does not exist")
|
|
646
|
+
|
|
647
|
+
@property
|
|
648
|
+
def graph_has_input_data(self) -> bool:
|
|
649
|
+
return self._input_data is not None
|
|
650
|
+
|
|
651
|
+
def add_node_step(self,
|
|
652
|
+
node_id: Union[int, str],
|
|
653
|
+
function: Callable,
|
|
654
|
+
input_columns: List[str] = None,
|
|
655
|
+
output_schema: List[FlowfileColumn] = None,
|
|
656
|
+
node_type: str = None,
|
|
657
|
+
drop_columns: List[str] = None,
|
|
658
|
+
renew_schema: bool = True,
|
|
659
|
+
setting_input: Any = None,
|
|
660
|
+
cache_results: bool = None,
|
|
661
|
+
schema_callback: Callable = None,
|
|
662
|
+
input_node_ids: List[int] = None):
|
|
663
|
+
existing_node = self.get_node(node_id)
|
|
664
|
+
if existing_node is not None:
|
|
665
|
+
if existing_node.node_type != node_type:
|
|
666
|
+
self.delete_node(existing_node.node_id)
|
|
667
|
+
existing_node = None
|
|
668
|
+
if existing_node:
|
|
669
|
+
input_nodes = existing_node.all_inputs
|
|
670
|
+
elif input_node_ids is not None:
|
|
671
|
+
input_nodes = [self.get_node(node_id) for node_id in input_node_ids]
|
|
672
|
+
else:
|
|
673
|
+
input_nodes = None
|
|
674
|
+
if cache_results is None:
|
|
675
|
+
if hasattr(setting_input, 'cache_results'):
|
|
676
|
+
cache_results = getattr(setting_input, 'cache_results')
|
|
677
|
+
cache_results = False if cache_results is None else cache_results
|
|
678
|
+
if isinstance(input_columns, str):
|
|
679
|
+
input_columns = [input_columns]
|
|
680
|
+
|
|
681
|
+
if input_nodes is not None or function.__name__ in ('placeholder', 'analysis_preparation'):
|
|
682
|
+
|
|
683
|
+
if not existing_node:
|
|
684
|
+
node = FlowNode(node_id=node_id,
|
|
685
|
+
function=function,
|
|
686
|
+
output_schema=output_schema,
|
|
687
|
+
input_columns=input_columns,
|
|
688
|
+
drop_columns=drop_columns,
|
|
689
|
+
renew_schema=renew_schema,
|
|
690
|
+
setting_input=setting_input,
|
|
691
|
+
node_type=node_type,
|
|
692
|
+
name=function.__name__,
|
|
693
|
+
schema_callback=schema_callback,
|
|
694
|
+
parent_uuid=self.uuid)
|
|
695
|
+
else:
|
|
696
|
+
existing_node.update_node(function=function,
|
|
697
|
+
output_schema=output_schema,
|
|
698
|
+
input_columns=input_columns,
|
|
699
|
+
drop_columns=drop_columns,
|
|
700
|
+
setting_input=setting_input,
|
|
701
|
+
schema_callback=schema_callback)
|
|
702
|
+
node = existing_node
|
|
703
|
+
elif node_type == 'input_data':
|
|
704
|
+
node = None
|
|
705
|
+
else:
|
|
706
|
+
raise Exception("No data initialized")
|
|
707
|
+
self._node_db[node_id] = node
|
|
708
|
+
self._node_ids.append(node_id)
|
|
709
|
+
|
|
710
|
+
def add_include_cols(self, include_columns: List[str]):
|
|
711
|
+
for column in include_columns:
|
|
712
|
+
if column not in self._input_cols:
|
|
713
|
+
self._input_cols.append(column)
|
|
714
|
+
if column not in self._output_cols:
|
|
715
|
+
self._output_cols.append(column)
|
|
716
|
+
return self
|
|
717
|
+
|
|
718
|
+
def add_output(self, output_file: input_schema.NodeOutput):
|
|
719
|
+
def _func(df: FlowDataEngine):
|
|
720
|
+
execute_remote = self.execution_location != 'local'
|
|
721
|
+
df.output(output_fs=output_file.output_settings, flow_id=self.flow_id, node_id=output_file.node_id,
|
|
722
|
+
execute_remote=execute_remote)
|
|
723
|
+
return df
|
|
724
|
+
|
|
725
|
+
def schema_callback():
|
|
726
|
+
input_node: FlowNode = self.get_node(output_file.node_id).node_inputs.main_inputs[0]
|
|
727
|
+
|
|
728
|
+
return input_node.schema
|
|
729
|
+
|
|
730
|
+
self.add_node_step(node_id=output_file.node_id,
|
|
731
|
+
function=_func,
|
|
732
|
+
input_columns=[],
|
|
733
|
+
node_type='output',
|
|
734
|
+
setting_input=output_file,
|
|
735
|
+
schema_callback=schema_callback,
|
|
736
|
+
input_node_ids=[output_file.depending_on_id])
|
|
737
|
+
|
|
738
|
+
def add_database_writer(self, node_database_writer: input_schema.NodeDatabaseWriter):
|
|
739
|
+
logger.info("Adding database reader")
|
|
740
|
+
node_type = 'database_writer'
|
|
741
|
+
database_settings: input_schema.DatabaseWriteSettings = node_database_writer.database_write_settings
|
|
742
|
+
database_connection: Optional[input_schema.DatabaseConnection | input_schema.FullDatabaseConnection]
|
|
743
|
+
if database_settings.connection_mode == 'inline':
|
|
744
|
+
database_connection: input_schema.DatabaseConnection = database_settings.database_connection
|
|
745
|
+
encrypted_password = get_encrypted_secret(current_user_id=node_database_writer.user_id,
|
|
746
|
+
secret_name=database_connection.password_ref)
|
|
747
|
+
if encrypted_password is None:
|
|
748
|
+
raise HTTPException(status_code=400, detail="Password not found")
|
|
749
|
+
else:
|
|
750
|
+
database_reference_settings = get_local_database_connection(database_settings.database_connection_name,
|
|
751
|
+
node_database_writer.user_id)
|
|
752
|
+
encrypted_password = database_reference_settings.password.get_secret_value()
|
|
753
|
+
|
|
754
|
+
def _func(df: FlowDataEngine):
|
|
755
|
+
df.lazy = True
|
|
756
|
+
database_external_write_settings = (
|
|
757
|
+
sql_models.DatabaseExternalWriteSettings.create_from_from_node_database_writer(
|
|
758
|
+
node_database_writer=node_database_writer,
|
|
759
|
+
password=encrypted_password,
|
|
760
|
+
table_name=(database_settings.schema_name+'.'+database_settings.table_name
|
|
761
|
+
if database_settings.schema_name else database_settings.table_name),
|
|
762
|
+
database_reference_settings=(database_reference_settings if database_settings.connection_mode == 'reference'
|
|
763
|
+
else None),
|
|
764
|
+
lf=df.data_frame
|
|
765
|
+
)
|
|
766
|
+
)
|
|
767
|
+
external_database_writer = ExternalDatabaseWriter(database_external_write_settings, wait_on_completion=False)
|
|
768
|
+
node._fetch_cached_df = external_database_writer
|
|
769
|
+
external_database_writer.get_result()
|
|
770
|
+
return df
|
|
771
|
+
|
|
772
|
+
def schema_callback():
|
|
773
|
+
input_node: FlowNode = self.get_node(node_database_writer.node_id).node_inputs.main_inputs[0]
|
|
774
|
+
return input_node.schema
|
|
775
|
+
|
|
776
|
+
self.add_node_step(
|
|
777
|
+
node_id=node_database_writer.node_id,
|
|
778
|
+
function=_func,
|
|
779
|
+
input_columns=[],
|
|
780
|
+
node_type=node_type,
|
|
781
|
+
setting_input=node_database_writer,
|
|
782
|
+
schema_callback=schema_callback,
|
|
783
|
+
)
|
|
784
|
+
node = self.get_node(node_database_writer.node_id)
|
|
785
|
+
|
|
786
|
+
def add_database_reader(self, node_database_reader: input_schema.NodeDatabaseReader):
|
|
787
|
+
logger.info("Adding database reader")
|
|
788
|
+
node_type = 'database_reader'
|
|
789
|
+
database_settings: input_schema.DatabaseSettings = node_database_reader.database_settings
|
|
790
|
+
database_connection: Optional[input_schema.DatabaseConnection | input_schema.FullDatabaseConnection]
|
|
791
|
+
if database_settings.connection_mode == 'inline':
|
|
792
|
+
database_connection: input_schema.DatabaseConnection = database_settings.database_connection
|
|
793
|
+
encrypted_password = get_encrypted_secret(current_user_id=node_database_reader.user_id,
|
|
794
|
+
secret_name=database_connection.password_ref)
|
|
795
|
+
if encrypted_password is None:
|
|
796
|
+
raise HTTPException(status_code=400, detail="Password not found")
|
|
797
|
+
else:
|
|
798
|
+
database_reference_settings = get_local_database_connection(database_settings.database_connection_name,
|
|
799
|
+
node_database_reader.user_id)
|
|
800
|
+
database_connection = database_reference_settings
|
|
801
|
+
encrypted_password = database_reference_settings.password.get_secret_value()
|
|
802
|
+
|
|
803
|
+
def _func():
|
|
804
|
+
sql_source = BaseSqlSource(query=None if database_settings.query_mode == 'table' else database_settings.query,
|
|
805
|
+
table_name=database_settings.table_name,
|
|
806
|
+
schema_name=database_settings.schema_name,
|
|
807
|
+
fields=node_database_reader.fields,
|
|
808
|
+
)
|
|
809
|
+
database_external_read_settings = (
|
|
810
|
+
sql_models.DatabaseExternalReadSettings.create_from_from_node_database_reader(
|
|
811
|
+
node_database_reader=node_database_reader,
|
|
812
|
+
password=encrypted_password,
|
|
813
|
+
query=sql_source.query,
|
|
814
|
+
database_reference_settings=(database_reference_settings if database_settings.connection_mode == 'reference'
|
|
815
|
+
else None),
|
|
816
|
+
)
|
|
817
|
+
)
|
|
818
|
+
|
|
819
|
+
external_database_fetcher = ExternalDatabaseFetcher(database_external_read_settings, wait_on_completion=False)
|
|
820
|
+
node._fetch_cached_df = external_database_fetcher
|
|
821
|
+
fl = FlowDataEngine(external_database_fetcher.get_result())
|
|
822
|
+
node_database_reader.fields = [c.get_minimal_field_info() for c in fl.schema]
|
|
823
|
+
return fl
|
|
824
|
+
|
|
825
|
+
def schema_callback():
|
|
826
|
+
sql_source = SqlSource(connection_string=
|
|
827
|
+
sql_utils.construct_sql_uri(database_type=database_connection.database_type,
|
|
828
|
+
host=database_connection.host,
|
|
829
|
+
port=database_connection.port,
|
|
830
|
+
database=database_connection.database,
|
|
831
|
+
username=database_connection.username,
|
|
832
|
+
password=decrypt_secret(encrypted_password)),
|
|
833
|
+
query=None if database_settings.query_mode == 'table' else database_settings.query,
|
|
834
|
+
table_name=database_settings.table_name,
|
|
835
|
+
schema_name=database_settings.schema_name,
|
|
836
|
+
fields=node_database_reader.fields,
|
|
837
|
+
)
|
|
838
|
+
return sql_source.get_schema()
|
|
839
|
+
|
|
840
|
+
node = self.get_node(node_database_reader.node_id)
|
|
841
|
+
if node:
|
|
842
|
+
node.node_type = node_type
|
|
843
|
+
node.name = node_type
|
|
844
|
+
node.function = _func
|
|
845
|
+
node.setting_input = node_database_reader
|
|
846
|
+
node.node_settings.cache_results = node_database_reader.cache_results
|
|
847
|
+
if node_database_reader.node_id not in set(start_node.node_id for start_node in self._flow_starts):
|
|
848
|
+
self._flow_starts.append(node)
|
|
849
|
+
node.schema_callback = schema_callback
|
|
850
|
+
else:
|
|
851
|
+
node = FlowNode(node_database_reader.node_id, function=_func,
|
|
852
|
+
setting_input=node_database_reader,
|
|
853
|
+
name=node_type, node_type=node_type, parent_uuid=self.uuid,
|
|
854
|
+
schema_callback=schema_callback)
|
|
855
|
+
self._node_db[node_database_reader.node_id] = node
|
|
856
|
+
self._flow_starts.append(node)
|
|
857
|
+
self._node_ids.append(node_database_reader.node_id)
|
|
858
|
+
|
|
859
|
+
def add_airbyte_reader(self, external_source_input: input_schema.NodeAirbyteReader):
|
|
860
|
+
logger.info('Adding airbyte reader')
|
|
861
|
+
node_type = 'airbyte_reader'
|
|
862
|
+
source_settings: input_schema.AirbyteReader = external_source_input.source_settings
|
|
863
|
+
airbyte_settings = airbyte_settings_from_config(source_settings, flow_id=self.flow_id,
|
|
864
|
+
node_id=external_source_input.node_id)
|
|
865
|
+
|
|
866
|
+
logger.info("Airbyte settings created")
|
|
867
|
+
airbyte_settings.fields = source_settings.fields
|
|
868
|
+
external_source = data_source_factory(source_type='airbyte', airbyte_settings=airbyte_settings)
|
|
869
|
+
|
|
870
|
+
def _func():
|
|
871
|
+
logger.info('Calling external source')
|
|
872
|
+
external_fetcher = ExternalAirbyteFetcher(airbyte_settings, wait_on_completion=False)
|
|
873
|
+
node._fetch_cached_df = external_fetcher
|
|
874
|
+
fl = FlowDataEngine(external_fetcher.get_result())
|
|
875
|
+
external_source_input.source_settings.fields = [c.get_minimal_field_info() for c in fl.schema]
|
|
876
|
+
return fl
|
|
877
|
+
|
|
878
|
+
def schema_callback():
|
|
879
|
+
return [FlowfileColumn.from_input(f.name, f.data_type) for f in external_source.schema]
|
|
880
|
+
|
|
881
|
+
node = self.get_node(external_source_input.node_id)
|
|
882
|
+
if node:
|
|
883
|
+
node.node_type = node_type
|
|
884
|
+
node.name = node_type
|
|
885
|
+
node.function = _func
|
|
886
|
+
node.setting_input = external_source_input
|
|
887
|
+
node.node_settings.cache_results = external_source_input.cache_results
|
|
888
|
+
if external_source_input.node_id not in set(start_node.node_id for start_node in self._flow_starts):
|
|
889
|
+
self._flow_starts.append(node)
|
|
890
|
+
node.schema_callback = schema_callback
|
|
891
|
+
else:
|
|
892
|
+
node = FlowNode(external_source_input.node_id, function=_func,
|
|
893
|
+
setting_input=external_source_input,
|
|
894
|
+
name=node_type, node_type=node_type, parent_uuid=self.uuid,
|
|
895
|
+
schema_callback=schema_callback)
|
|
896
|
+
self._node_db[external_source_input.node_id] = node
|
|
897
|
+
self._flow_starts.append(node)
|
|
898
|
+
self._node_ids.append(external_source_input.node_id)
|
|
899
|
+
if external_source_input.source_settings.fields and len(external_source_input.source_settings.fields) > 0:
|
|
900
|
+
logger.info('Using provided schema in the node')
|
|
901
|
+
|
|
902
|
+
def add_google_sheet(self, external_source_input: input_schema.NodeExternalSource):
|
|
903
|
+
logger.info('Adding google sheet reader')
|
|
904
|
+
self.add_external_source(external_source_input)
|
|
905
|
+
|
|
906
|
+
def add_sql_source(self, external_source_input: input_schema.NodeExternalSource):
|
|
907
|
+
logger.info('Adding sql source')
|
|
908
|
+
self.add_external_source(external_source_input)
|
|
909
|
+
|
|
910
|
+
    def add_external_source(self,
                            external_source_input: input_schema.NodeExternalSource | input_schema.NodeAirbyteReader):

        custom_source_type = external_source_input.identifier != 'airbyte'
        if custom_source_type:
            node_type = 'external_source'
            external_source_script = getattr(external_sources.custom_external_sources, external_source_input.identifier)
            source_settings = (getattr(input_schema, snake_case_to_camel_case(external_source_input.identifier)).
                               model_validate(external_source_input.source_settings))
            if hasattr(external_source_script, 'initial_getter'):
                initial_getter = getattr(external_source_script, 'initial_getter')(source_settings)
            else:
                initial_getter = None
            data_getter = external_source_script.getter(source_settings)
            external_source = data_source_factory(source_type='custom',
                                                  data_getter=data_getter,
                                                  initial_data_getter=initial_getter,
                                                  orientation=external_source_input.source_settings.orientation,
                                                  schema=None)
        else:
            node_type = 'airbyte_reader'
            source_settings: input_schema.AirbyteReader = external_source_input.source_settings
            airbyte_settings = airbyte_settings_from_config(source_settings, flow_id=self.flow_id,
                                                            node_id=external_source_input.node_id)
            airbyte_settings.fields = source_settings.fields
            external_source = data_source_factory(source_type='airbyte', airbyte_settings=airbyte_settings)

        def _func():
            logger.info('Calling external source')
            fl = FlowDataEngine.create_from_external_source(external_source=external_source)
            external_source_input.source_settings.fields = [c.get_minimal_field_info() for c in fl.schema]
            return fl

        node = self.get_node(external_source_input.node_id)
        if node:
            node.node_type = node_type
            node.name = node_type
            node.function = _func
            node.setting_input = external_source_input
            node.node_settings.cache_results = external_source_input.cache_results
            if external_source_input.node_id not in set(start_node.node_id for start_node in self._flow_starts):
                self._flow_starts.append(node)
        else:
            node = FlowNode(external_source_input.node_id, function=_func,
                            setting_input=external_source_input,
                            name=node_type, node_type=node_type, parent_uuid=self.uuid)
            self._node_db[external_source_input.node_id] = node
            self._flow_starts.append(node)
            self._node_ids.append(external_source_input.node_id)
        if external_source_input.source_settings.fields and len(external_source_input.source_settings.fields) > 0:
            logger.info('Using provided schema in the node')

            def schema_callback():
                return [FlowfileColumn.from_input(f.name, f.data_type) for f in
                        external_source_input.source_settings.fields]

            node.schema_callback = schema_callback
        else:
            logger.warning('Removing schema')
            node._schema_callback = None
        self.add_node_step(node_id=external_source_input.node_id,
                           function=_func,
                           input_columns=[],
                           node_type=node_type,
                           setting_input=external_source_input)

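The custom-source branch above relies on a naming convention: the source identifier (for example 'google_sheet') is converted to the CamelCase name of its settings model in input_schema, looked up with getattr, and validated with model_validate. The snake_case_to_camel_case helper itself is defined elsewhere in the package; the sketch below only shows the conversion this convention implies and may differ from the real helper.

def snake_case_to_camel_case(identifier: str) -> str:
    # 'google_sheet' -> 'GoogleSheet'; each underscore-separated part is capitalized
    return ''.join(part.capitalize() for part in identifier.split('_'))


assert snake_case_to_camel_case('google_sheet') == 'GoogleSheet'
assert snake_case_to_camel_case('sql_source') == 'SqlSource'
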
    def add_read(self, input_file: input_schema.NodeRead):
        if input_file.received_file.file_type in ('xlsx', 'excel') and input_file.received_file.sheet_name == '':
            sheet_name = fastexcel.read_excel(input_file.received_file.path).sheet_names[0]
            input_file.received_file.sheet_name = sheet_name

        received_file = input_file.received_file
        input_file.received_file.set_absolute_filepath()

        def _func():
            if input_file.received_file.file_type == 'parquet':
                input_data = FlowDataEngine.create_from_path(input_file.received_file)
            elif input_file.received_file.file_type == 'csv' and 'utf' in input_file.received_file.encoding:
                input_data = FlowDataEngine.create_from_path(input_file.received_file)
            else:
                input_data = FlowDataEngine.create_from_path_worker(input_file.received_file,
                                                                    node_id=input_file.node_id,
                                                                    flow_id=self.flow_id)
            input_data.name = input_file.received_file.name
            return input_data

        node = self.get_node(input_file.node_id)
        schema_callback = None
        if node:
            start_hash = node.hash
            node.node_type = 'read'
            node.name = 'read'
            node.function = _func
            node.setting_input = input_file
            if input_file.node_id not in set(start_node.node_id for start_node in self._flow_starts):
                self._flow_starts.append(node)

            if start_hash != node.hash:
                logger.info('Hash changed, updating schema')
                if len(received_file.fields) > 0:
                    # If the file has fields defined, we can use them to create the schema
                    def schema_callback():
                        return [FlowfileColumn.from_input(f.name, f.data_type) for f in received_file.fields]

                elif input_file.received_file.file_type in ('csv', 'json', 'parquet'):
                    # everything that can be scanned by polars
                    def schema_callback():
                        input_data = FlowDataEngine.create_from_path(input_file.received_file)
                        return input_data.schema

                elif input_file.received_file.file_type in ('xlsx', 'excel'):
                    # If the file is an Excel file, we need to use the openpyxl engine to read the schema
                    schema_callback = get_xlsx_schema_callback(engine='openpyxl',
                                                               file_path=received_file.file_path,
                                                               sheet_name=received_file.sheet_name,
                                                               start_row=received_file.start_row,
                                                               end_row=received_file.end_row,
                                                               start_column=received_file.start_column,
                                                               end_column=received_file.end_column,
                                                               has_headers=received_file.has_headers)
                else:
                    schema_callback = None
        else:
            node = FlowNode(input_file.node_id, function=_func,
                            setting_input=input_file,
                            name='read', node_type='read', parent_uuid=self.uuid)
            self._node_db[input_file.node_id] = node
            self._flow_starts.append(node)
            self._node_ids.append(input_file.node_id)

        if schema_callback is not None:
            node.schema_callback = schema_callback
        return self

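A note on the schema handling in add_read: rather than computing a schema eagerly, the method stores a zero-argument callback on the node, so the potentially expensive file scan runs only when the schema is first requested, and only when the node's hash has changed. A stripped-down sketch of that deferral pattern, with read_schema_from_disk as a hypothetical stand-in for the real readers:

from typing import Callable, List


def read_schema_from_disk(path: str) -> List[str]:
    # Hypothetical stand-in for FlowDataEngine.create_from_path(...).schema
    print(f'scanning {path}')
    return ['id', 'name']


def make_schema_callback(path: str) -> Callable[[], List[str]]:
    # Capture the path in a closure; nothing is scanned yet
    return lambda: read_schema_from_disk(path)


schema_callback = make_schema_callback('data.csv')
# The scan only happens here, when the schema is actually needed:
columns = schema_callback()
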
    def add_datasource(self, input_file: input_schema.NodeDatasource | input_schema.NodeManualInput):
        if isinstance(input_file, input_schema.NodeManualInput):
            input_data = FlowDataEngine(input_file.raw_data)
            ref = 'manual_input'
        else:
            input_data = FlowDataEngine(path_ref=input_file.file_ref)
            ref = 'datasource'
        node = self.get_node(input_file.node_id)
        if node:
            node.node_type = ref
            node.name = ref
            node.function = input_data
            node.setting_input = input_file

            if input_file.node_id not in set(start_node.node_id for start_node in self._flow_starts):
                self._flow_starts.append(node)
        else:
            node = FlowNode(input_file.node_id, function=input_data,
                            setting_input=input_file,
                            name=ref, node_type=ref, parent_uuid=self.uuid)
            self._node_db[input_file.node_id] = node
            self._flow_starts.append(node)
            self._node_ids.append(input_file.node_id)
        return self

    def add_manual_input(self, input_file: input_schema.NodeManualInput):
        self.add_datasource(input_file)

    @property
    def nodes(self) -> List[FlowNode]:
        return list(self._node_db.values())

    def check_for_missed_cols(self, expected_cols: List):
        not_filled_cols = set(expected_cols) - set(self._output_cols)
        cols_available = list(not_filled_cols & set([c.name for c in self._input_data.schema]))
        self._output_cols += cols_available

    @property
    def input_data_columns(self) -> List[str]:
        if self._input_cols:
            return list(set([col for col in self._input_cols if
                             col in [table_col.name for table_col in self._input_data.schema]]))

    @property
    def execution_mode(self) -> str:
        return self.flow_settings.execution_mode

    def get_implicit_starter_nodes(self) -> List[FlowNode]:
        """Ensure that nodes that can be a start (e.g. polars code) are treated as starting nodes."""
        starting_node_ids = [node.node_id for node in self._flow_starts]
        implicit_starting_nodes = []
        for node in self.nodes:
            if node.node_template.can_be_start and not node.has_input and node.node_id not in starting_node_ids:
                implicit_starting_nodes.append(node)
        return implicit_starting_nodes

    @execution_mode.setter
    def execution_mode(self, mode: str):
        self.flow_settings.execution_mode = mode

    @property
    def execution_location(self) -> schemas.ExecutionLocationsLiteral:
        return self.flow_settings.execution_location

    @execution_location.setter
    def execution_location(self, execution_location: schemas.ExecutionLocationsLiteral):
        self.flow_settings.execution_location = execution_location

    def run_graph(self):
        if self.flow_settings.is_running:
            raise Exception('Flow is already running')
        try:
            self.flow_settings.is_running = True
            self.flow_settings.is_canceled = False
            self.flow_logger.clear_log_file()
            self.nodes_completed = 0
            self.node_results = []
            self.start_datetime = datetime.datetime.now()
            self.end_datetime = None
            self.latest_run_info = None
            self.flow_logger.info('Starting to run flowfile flow...')
            skip_nodes = [node for node in self.nodes if not node.is_correct]
            skip_nodes.extend([lead_to_node for node in skip_nodes for lead_to_node in node.leads_to_nodes])
            execution_order = determine_execution_order(all_nodes=[node for node in self.nodes if
                                                                   node not in skip_nodes],
                                                        flow_starts=self._flow_starts + self.get_implicit_starter_nodes())

            skip_node_message(self.flow_logger, skip_nodes)
            execution_order_message(self.flow_logger, execution_order)
            performance_mode = self.flow_settings.execution_mode == 'Performance'
            for node in execution_order:
                node_logger = self.flow_logger.get_node_logger(node.node_id)
                if self.flow_settings.is_canceled:
                    self.flow_logger.info('Flow canceled')
                    break
                if node in skip_nodes:
                    node_logger.info(f'Skipping node {node.node_id}')
                    continue
                node_result = NodeResult(node_id=node.node_id, node_name=node.name)
                self.node_results.append(node_result)
                logger.info(f'Starting to run: node {node.node_id}, start time: {node_result.start_timestamp}')
                node.execute_node(run_location=self.flow_settings.execution_location,
                                  performance_mode=performance_mode,
                                  node_logger=node_logger)
                try:
                    node_result.error = str(node.results.errors)
                    if self.flow_settings.is_canceled:
                        node_result.success = None
                        node_result.is_running = False
                        continue
                    node_result.success = node.results.errors is None
                    node_result.end_timestamp = time()
                    node_result.run_time = node_result.end_timestamp - node_result.start_timestamp
                    node_result.is_running = False
                except Exception as e:
                    node_result.error = 'Node did not run'
                    node_result.success = False
                    node_result.end_timestamp = time()
                    node_result.run_time = node_result.end_timestamp - node_result.start_timestamp
                    node_result.is_running = False
                    node_logger.error(f'Error in node {node.node_id}: {e}')
                if not node_result.success:
                    skip_nodes.extend(list(node.get_all_dependent_nodes()))
                node_logger.info(f'Completed node with success: {node_result.success}')
                self.nodes_completed += 1
            self.flow_logger.info('Flow completed!')
            self.end_datetime = datetime.datetime.now()
            self.flow_settings.is_running = False
            if self.flow_settings.is_canceled:
                self.flow_logger.info('Flow canceled')
            return self.get_run_info()
        except Exception as e:
            raise e
        finally:
            self.flow_settings.is_running = False

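determine_execution_order is imported from elsewhere in the package, so its algorithm is not visible in this diff; the contract run_graph depends on is that every node appears after all of its inputs, starting from the flow starts. Kahn's topological sort is one standard way to meet that contract, sketched here purely for illustration, not as the package's actual implementation:

from collections import deque
from typing import Dict, Hashable, Iterable, List, Tuple


def topological_order(nodes: Iterable[Hashable],
                      edges: Iterable[Tuple[Hashable, Hashable]]) -> List[Hashable]:
    # Kahn's algorithm: repeatedly emit nodes with no unprocessed inputs
    indegree: Dict[Hashable, int] = {n: 0 for n in nodes}
    adjacency: Dict[Hashable, List[Hashable]] = {n: [] for n in indegree}
    for src, dst in edges:
        adjacency[src].append(dst)
        indegree[dst] += 1
    queue = deque(n for n, degree in indegree.items() if degree == 0)
    order: List[Hashable] = []
    while queue:
        current = queue.popleft()
        order.append(current)
        for downstream in adjacency[current]:
            indegree[downstream] -= 1
            if indegree[downstream] == 0:
                queue.append(downstream)
    return order


# A tiny diamond-shaped flow: 1 feeds 2 and 3, both feed 4
print(topological_order([1, 2, 3, 4], [(1, 2), (1, 3), (2, 4), (3, 4)]))
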
    def get_run_info(self) -> RunInformation:
        if self.latest_run_info is None:
            node_results = self.node_results
            success = all(nr.success for nr in node_results)
            self.latest_run_info = RunInformation(start_time=self.start_datetime, end_time=self.end_datetime,
                                                  success=success,
                                                  node_step_result=node_results, flow_id=self.flow_id,
                                                  nodes_completed=self.nodes_completed,
                                                  number_of_nodes=len(self.nodes))
        elif self.latest_run_info.nodes_completed != self.nodes_completed:
            node_results = self.node_results
            self.latest_run_info = RunInformation(start_time=self.start_datetime, end_time=self.end_datetime,
                                                  success=all(nr.success for nr in node_results),
                                                  node_step_result=node_results, flow_id=self.flow_id,
                                                  nodes_completed=self.nodes_completed,
                                                  number_of_nodes=len(self.nodes))
        return self.latest_run_info

    @property
    def node_connections(self) -> List[Tuple[int, int]]:
        connections = set()
        for node in self.nodes:
            outgoing_connections = [(node.node_id, ltn.node_id) for ltn in node.leads_to_nodes]
            incoming_connections = [(don.node_id, node.node_id) for don in node.all_inputs]
            node_connections = [c for c in outgoing_connections + incoming_connections if (c[0] is not None
                                                                                           and c[1] is not None)]
            for node_connection in node_connections:
                if node_connection not in connections:
                    connections.add(node_connection)
        return list(connections)

    def get_schema(self) -> List[FlowfileColumn]:
        if self.schema is None:
            if len(self._node_ids) > 0:
                self.schema = self._node_db[self._node_ids[0]].schema
        return self.schema

    def get_example_data(self, node_id: int) -> TableExample | None:
        node = self._node_db[node_id]
        return node.get_table_example(include_data=True)

    def get_node_data(self, node_id: int, include_example: bool = True) -> NodeData:
        node = self._node_db[node_id]
        return node.get_node_data(flow_id=self.flow_id, include_example=include_example)

    def get_node_storage(self) -> schemas.FlowInformation:
        node_information = {node.node_id: node.get_node_information() for
                            node in self.nodes if node.is_setup and node.is_correct}

        return schemas.FlowInformation(flow_id=self.flow_id,
                                       flow_name=self.__name__,
                                       storage_location=self.flow_settings.path,
                                       flow_settings=self.flow_settings,
                                       data=node_information,
                                       node_starts=[v.node_id for v in self._flow_starts],
                                       node_connections=self.node_connections)

    def cancel(self):
        if not self.flow_settings.is_running:
            return
        self.flow_settings.is_canceled = True
        for node in self.nodes:
            node.cancel()

    def close_flow(self):
        for node in self.nodes:
            node.remove_cache()

    def save_flow(self, flow_path: str):
        with open(flow_path, 'wb') as f:
            pickle.dump(self.get_node_storage(), f)
        self.flow_settings.path = flow_path

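save_flow persists the schemas.FlowInformation snapshot with pickle rather than a text format, so restoring a saved flow is a plain pickle.load. A hedged round-trip sketch, where graph stands for an already-built FlowGraph and the file name is arbitrary:

import pickle

# graph.save_flow('my_flow.flowfile')  # writes pickle.dumps(graph.get_node_storage())

with open('my_flow.flowfile', 'rb') as f:
    flow_information = pickle.load(f)  # a schemas.FlowInformation instance

print(flow_information.flow_id, flow_information.node_connections)
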
    def get_frontend_data(self):
        result = {
            'Home': {
                "data": {}
            }
        }
        flow_info: schemas.FlowInformation = self.get_node_storage()

        for node_id, node_info in flow_info.data.items():
            if node_info.is_setup:
                try:
                    pos_x = node_info.data.pos_x
                    pos_y = node_info.data.pos_y
                    # Basic node structure
                    result["Home"]["data"][str(node_id)] = {
                        "id": node_info.id,
                        "name": node_info.type,
                        "data": {},  # Additional data can go here
                        "class": node_info.type,
                        "html": node_info.type,
                        "typenode": "vue",
                        "inputs": {},
                        "outputs": {},
                        "pos_x": pos_x,
                        "pos_y": pos_y
                    }
                except Exception as e:
                    logger.error(e)
                # Add outputs to the node based on `outputs` in your backend data
                if node_info.outputs:
                    outputs = {o: 0 for o in node_info.outputs}
                    for o in node_info.outputs:
                        outputs[o] += 1
                    connections = []
                    for output_node_id, n_connections in outputs.items():
                        leading_to_node = self.get_node(output_node_id)
                        input_types = leading_to_node.get_input_type(node_info.id)
                        for input_type in input_types:
                            if input_type == 'main':
                                input_frontend_id = 'input_1'
                            elif input_type == 'right':
                                input_frontend_id = 'input_2'
                            elif input_type == 'left':
                                input_frontend_id = 'input_3'
                            else:
                                input_frontend_id = 'input_1'
                            connection = {"node": str(output_node_id), "input": input_frontend_id}
                            connections.append(connection)

                    result["Home"]["data"][str(node_id)]["outputs"]["output_1"] = {
                        "connections": connections}
                else:
                    result["Home"]["data"][str(node_id)]["outputs"] = {"output_1": {"connections": []}}

                # Add input to the node based on `depending_on_id` in your backend data
                if node_info.left_input_id is not None or node_info.right_input_id is not None or node_info.input_ids is not None:
                    main_inputs = node_info.main_input_ids
                    result["Home"]["data"][str(node_id)]["inputs"]["input_1"] = {
                        "connections": [{"node": str(main_node_id), "input": "output_1"} for main_node_id in main_inputs]
                    }
                    if node_info.right_input_id is not None:
                        result["Home"]["data"][str(node_id)]["inputs"]["input_2"] = {
                            "connections": [{"node": str(node_info.right_input_id), "input": "output_1"}]
                        }
                    if node_info.left_input_id is not None:
                        result["Home"]["data"][str(node_id)]["inputs"]["input_3"] = {
                            "connections": [{"node": str(node_info.left_input_id), "input": "output_1"}]
                        }
        return result

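The keys emitted by get_frontend_data ('class', 'html', 'typenode', nested 'inputs'/'outputs' with 'connections' lists, 'pos_x'/'pos_y') match the export shape used by Drawflow-style node editors, which appears to be the consumer here. For orientation, a hand-written example of the structure for a single read node feeding node 2, with purely illustrative values:

example = {
    "Home": {
        "data": {
            "1": {
                "id": 1,
                "name": "read",
                "data": {},
                "class": "read",
                "html": "read",
                "typenode": "vue",
                "inputs": {},
                "outputs": {"output_1": {"connections": [{"node": "2", "input": "input_1"}]}},
                "pos_x": 100,
                "pos_y": 200,
            }
        }
    }
}
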
    def get_vue_flow_input(self) -> schemas.VueFlowInput:
        edges: List[schemas.NodeEdge] = []
        nodes: List[schemas.NodeInput] = []
        for node in self.nodes:
            nodes.append(node.get_node_input())
            edges.extend(node.get_edge_input())
        return schemas.VueFlowInput(node_edges=edges, node_inputs=nodes)

    def reset(self):
        for node in self.nodes:
            node.reset(True)

    def copy_node(self, new_node_settings: input_schema.NodePromise, existing_setting_input: Any, node_type: str) -> None:
        """Copy an existing node with potentially new settings."""
        self.add_node_promise(new_node_settings)

        if isinstance(existing_setting_input, input_schema.NodePromise):
            return

        combined_settings = combine_existing_settings_and_new_settings(
            existing_setting_input, new_node_settings
        )
        getattr(self, f"add_{node_type}")(combined_settings)

def combine_existing_settings_and_new_settings(setting_input: Any, new_settings: input_schema.NodePromise) -> Any:
    """Combine existing settings with new settings from a NodePromise."""
    copied_setting_input = deepcopy(setting_input)

    # Update only attributes that exist on new_settings
    fields_to_update = (
        "node_id",
        "pos_x",
        "pos_y",
        "description",
        "flow_id"
    )

    for field in fields_to_update:
        if hasattr(new_settings, field) and getattr(new_settings, field) is not None:
            setattr(copied_setting_input, field, getattr(new_settings, field))

    return copied_setting_input

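Taken together, copy_node and combine_existing_settings_and_new_settings mean that only identity and layout fields come from the new NodePromise, every other setting survives the copy, and a None on the promise never erases an existing value. A self-contained demonstration of that merge rule, using SimpleNamespace stand-ins for the real settings models:

from copy import deepcopy
from types import SimpleNamespace

existing = SimpleNamespace(node_id=1, pos_x=10, pos_y=20, description='orig', flow_id=1,
                           file_path='data.csv')  # extra setting that must survive the copy
promise = SimpleNamespace(node_id=7, pos_x=50, pos_y=80, description=None, flow_id=1)

copied = deepcopy(existing)
for field in ("node_id", "pos_x", "pos_y", "description", "flow_id"):
    if hasattr(promise, field) and getattr(promise, field) is not None:
        setattr(copied, field, getattr(promise, field))

assert copied.node_id == 7 and copied.file_path == 'data.csv'
assert copied.description == 'orig'  # None on the promise leaves the original intact
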
def add_connection(flow: FlowGraph, node_connection: input_schema.NodeConnection):
    logger.info('adding a connection')
    from_node = flow.get_node(node_connection.output_connection.node_id)
    to_node = flow.get_node(node_connection.input_connection.node_id)
    logger.info(f'from_node={from_node}, to_node={to_node}')
    if not (from_node and to_node):
        raise HTTPException(404, 'Node not available')
    else:
        to_node.add_node_connection(from_node, node_connection.input_connection.get_node_input_connection_type())

def delete_connection(graph, node_connection: input_schema.NodeConnection):
    """Delete the connection between two nodes."""
    from_node = graph.get_node(node_connection.output_connection.node_id)
    to_node = graph.get_node(node_connection.input_connection.node_id)
    connection_valid = to_node.node_inputs.validate_if_input_connection_exists(
        node_input_id=from_node.node_id,
        connection_name=node_connection.input_connection.get_node_input_connection_type(),
    )
    if not connection_valid:
        raise HTTPException(422, "Connection does not exist on the input node")
    if from_node is not None:
        from_node.delete_lead_to_node(node_connection.input_connection.node_id)

    if to_node is not None:
        to_node.delete_input_node(
            node_connection.output_connection.node_id,
            connection_type=node_connection.input_connection.connection_class,
        )