flowfile-0.2.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
|
@@ -0,0 +1,1521 @@
|
|
|
1
|
+
# Standard library imports
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
from copy import deepcopy
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from math import ceil
|
|
7
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
|
|
8
|
+
|
|
9
|
+
# Third-party imports
|
|
10
|
+
from loky import Future
|
|
11
|
+
import polars as pl
|
|
12
|
+
from polars.exceptions import PanicException
|
|
13
|
+
from polars_grouper import graph_solver
|
|
14
|
+
from polars_expr_transformer import simple_function_to_expr as to_expr
|
|
15
|
+
from pyarrow.parquet import ParquetFile
|
|
16
|
+
|
|
17
|
+
# Local imports - Core
|
|
18
|
+
from flowfile_core.configs import logger
|
|
19
|
+
from flowfile_core.configs.flow_logger import NodeLogger
|
|
20
|
+
from flowfile_core.schemas import (
|
|
21
|
+
input_schema,
|
|
22
|
+
transform_schema as transform_schemas
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Local imports - Flow File Components
|
|
26
|
+
from flowfile_core.flowfile.flow_data_engine import utils
|
|
27
|
+
from flowfile_core.flowfile.flow_data_engine.create import funcs as create_funcs
|
|
28
|
+
from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
|
|
29
|
+
FlowfileColumn,
|
|
30
|
+
convert_stats_to_column_info
|
|
31
|
+
)
|
|
32
|
+
from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars
|
|
33
|
+
from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
|
|
34
|
+
from flowfile_core.flowfile.flow_data_engine.join import (
|
|
35
|
+
verify_join_select_integrity,
|
|
36
|
+
verify_join_map_integrity
|
|
37
|
+
)
|
|
38
|
+
from flowfile_core.flowfile.flow_data_engine.polars_code_parser import polars_code_parser
|
|
39
|
+
from flowfile_core.flowfile.flow_data_engine.sample_data import create_fake_data
|
|
40
|
+
from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_operations import (
|
|
41
|
+
ExternalCreateFetcher,
|
|
42
|
+
ExternalDfFetcher,
|
|
43
|
+
ExternalExecutorTracker,
|
|
44
|
+
ExternalFuzzyMatchFetcher,
|
|
45
|
+
fetch_unique_values
|
|
46
|
+
)
|
|
47
|
+
from flowfile_core.flowfile.flow_data_engine.threaded_processes import (
|
|
48
|
+
get_join_count,
|
|
49
|
+
write_threaded
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
|
|
56
|
+
class FlowDataEngine:
|
|
57
|
+
"""
|
|
58
|
+
A class that provides a unified interface for working with tabular data, supporting both eager and lazy evaluation.
|
|
59
|
+
|
|
60
|
+
The class is organized into several logical sections:
|
|
61
|
+
1. Core properties and initialization
|
|
62
|
+
2. Data access and manipulation
|
|
63
|
+
3. Schema and metadata operations
|
|
64
|
+
4. Transformations and operations
|
|
65
|
+
5. I/O operations
|
|
66
|
+
"""
|
|
67
|
+
|
|
68
|
+
# Core attributes
|
|
69
|
+
_data_frame: Union[pl.DataFrame, pl.LazyFrame]
|
|
70
|
+
columns: List[Any]
|
|
71
|
+
|
|
72
|
+
# Metadata attributes
|
|
73
|
+
name: str = None
|
|
74
|
+
number_of_records: int = None
|
|
75
|
+
errors: List = None
|
|
76
|
+
_schema: Optional[List['FlowfileColumn']] = None
|
|
77
|
+
|
|
78
|
+
# Configuration attributes
|
|
79
|
+
_optimize_memory: bool = False
|
|
80
|
+
_lazy: bool = None
|
|
81
|
+
_streamable: bool = True
|
|
82
|
+
_calculate_schema_stats: bool = False
|
|
83
|
+
|
|
84
|
+
# Cache and optimization attributes
|
|
85
|
+
__col_name_idx_map: Dict = None
|
|
86
|
+
__data_map: Dict = None
|
|
87
|
+
__optimized_columns: List = None
|
|
88
|
+
__sample__: str = None
|
|
89
|
+
__number_of_fields: int = None
|
|
90
|
+
_col_idx: Dict[str, int] = None
|
|
91
|
+
|
|
92
|
+
# Source tracking
|
|
93
|
+
_org_path: Optional[str] = None
|
|
94
|
+
_external_source: Optional[ExternalDataSource] = None
|
|
95
|
+
|
|
96
|
+
# State tracking
|
|
97
|
+
sorted_by: int = None
|
|
98
|
+
is_future: bool = False
|
|
99
|
+
is_collected: bool = True
|
|
100
|
+
ind_schema_calculated: bool = False
|
|
101
|
+
|
|
102
|
+
# Callbacks
|
|
103
|
+
_future: Future = None
|
|
104
|
+
_number_of_records_callback: Callable = None
|
|
105
|
+
_data_callback: Callable = None
|
|
106
|
+
|
|
107
|
+
# Tracking info
|
|
108
|
+
# node_id: int = None # TODO: Implement node_id
|
|
109
|
+
# flow_id: int = None # TODO: Implement flow_id
|
|
110
|
+
|
|
111
|
+
def __init__(self,
|
|
112
|
+
raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame] = None,
|
|
113
|
+
path_ref: str = None,
|
|
114
|
+
name: str = None,
|
|
115
|
+
optimize_memory: bool = True,
|
|
116
|
+
schema: List['FlowfileColumn'] | List[str] | pl.Schema = None,
|
|
117
|
+
number_of_records: int = None,
|
|
118
|
+
calculate_schema_stats: bool = False,
|
|
119
|
+
streamable: bool = True,
|
|
120
|
+
number_of_records_callback: Callable = None,
|
|
121
|
+
data_callback: Callable = None):
|
|
122
|
+
"""Initialize FlowDataEngine with various data sources and configuration options."""
|
|
123
|
+
self._initialize_attributes(number_of_records_callback, data_callback, streamable)
|
|
124
|
+
|
|
125
|
+
if raw_data is not None:
|
|
126
|
+
self._handle_raw_data(raw_data, number_of_records, optimize_memory)
|
|
127
|
+
elif path_ref:
|
|
128
|
+
self._handle_path_ref(path_ref, optimize_memory)
|
|
129
|
+
else:
|
|
130
|
+
self.initialize_empty_fl()
|
|
131
|
+
|
|
132
|
+
self._finalize_initialization(name, optimize_memory, schema, calculate_schema_stats)
|
|
133
|
+
|
|
134
|
+
def _initialize_attributes(self, number_of_records_callback, data_callback, streamable):
|
|
135
|
+
"""Initialize basic attributes with default values."""
|
|
136
|
+
self._external_source = None
|
|
137
|
+
self._number_of_records_callback = number_of_records_callback
|
|
138
|
+
self._data_callback = data_callback
|
|
139
|
+
self.ind_schema_calculated = False
|
|
140
|
+
self._streamable = streamable
|
|
141
|
+
self._org_path = None
|
|
142
|
+
self._lazy = False
|
|
143
|
+
self.errors = []
|
|
144
|
+
self._calculate_schema_stats = False
|
|
145
|
+
self.is_collected = True
|
|
146
|
+
self.is_future = False
|
|
147
|
+
|
|
148
|
+
def _handle_raw_data(self, raw_data, number_of_records, optimize_memory):
|
|
149
|
+
"""Process different types of input data."""
|
|
150
|
+
if isinstance(raw_data, pl.DataFrame):
|
|
151
|
+
self._handle_polars_dataframe(raw_data, number_of_records)
|
|
152
|
+
elif isinstance(raw_data, pl.LazyFrame):
|
|
153
|
+
self._handle_polars_lazy_frame(raw_data, number_of_records, optimize_memory)
|
|
154
|
+
elif isinstance(raw_data, (list, dict)):
|
|
155
|
+
self._handle_python_data(raw_data)
|
|
156
|
+
|
|
157
|
+
def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: Optional[int]):
|
|
158
|
+
"""Handle Polars DataFrame input."""
|
|
159
|
+
self.data_frame = df
|
|
160
|
+
self.number_of_records = number_of_records or df.select(pl.len())[0, 0]
|
|
161
|
+
|
|
162
|
+
def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: Optional[int], optimize_memory: bool):
|
|
163
|
+
"""Handle Polars LazyFrame input."""
|
|
164
|
+
self.data_frame = lf
|
|
165
|
+
self._lazy = True
|
|
166
|
+
if number_of_records is not None:
|
|
167
|
+
self.number_of_records = number_of_records
|
|
168
|
+
elif optimize_memory:
|
|
169
|
+
self.number_of_records = -1
|
|
170
|
+
else:
|
|
171
|
+
self.number_of_records = lf.select(pl.len()).collect()[0, 0]
|
|
172
|
+
|
|
173
|
+
def _handle_python_data(self, data: Union[List, Dict]):
|
|
174
|
+
"""Handle Python list or dict input."""
|
|
175
|
+
if isinstance(data, dict):
|
|
176
|
+
self._handle_dict_input(data)
|
|
177
|
+
else:
|
|
178
|
+
self._handle_list_input(data)
|
|
179
|
+
|
|
180
|
+
def _handle_dict_input(self, data: Dict):
|
|
181
|
+
"""Handle dictionary input."""
|
|
182
|
+
if len(data) == 0:
|
|
183
|
+
self.initialize_empty_fl()
|
|
184
|
+
lengths = [len(v) if isinstance(v, (list, tuple)) else 1 for v in data.values()]
|
|
185
|
+
|
|
186
|
+
if len(set(lengths)) == 1 and lengths[0]>1:
|
|
187
|
+
self.number_of_records = lengths[0]
|
|
188
|
+
self.data_frame = pl.DataFrame(data)
|
|
189
|
+
else:
|
|
190
|
+
self.number_of_records = 1
|
|
191
|
+
self.data_frame = pl.DataFrame([data])
|
|
192
|
+
|
|
193
|
+
def _handle_list_input(self, data: List):
|
|
194
|
+
"""Handle list input."""
|
|
195
|
+
number_of_records = len(data)
|
|
196
|
+
if number_of_records > 0:
|
|
197
|
+
processed_data = self._process_list_data(data)
|
|
198
|
+
self.number_of_records = number_of_records
|
|
199
|
+
self.data_frame = pl.DataFrame(processed_data)
|
|
200
|
+
self.lazy = True
|
|
201
|
+
else:
|
|
202
|
+
self.initialize_empty_fl()
|
|
203
|
+
self.number_of_records = 0
|
|
204
|
+
|
|
205
|
+
@staticmethod
|
|
206
|
+
def _process_list_data(data: List) -> List[Dict]:
|
|
207
|
+
"""Process list data into a format suitable for DataFrame creation."""
|
|
208
|
+
if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
|
|
209
|
+
try:
|
|
210
|
+
return pl.DataFrame(data).to_dicts()
|
|
211
|
+
except:
|
|
212
|
+
raise Exception('Value must be able to be converted to dictionary')
|
|
213
|
+
|
|
214
|
+
if not isinstance(data[0], dict):
|
|
215
|
+
data = [row.__dict__ for row in data]
|
|
216
|
+
|
|
217
|
+
return utils.ensure_similarity_dicts(data)
|
|
218
|
+
|
|
219
|
+
def _handle_path_ref(self, path_ref: str, optimize_memory: bool):
|
|
220
|
+
"""Handle file path reference input."""
|
|
221
|
+
try:
|
|
222
|
+
pf = ParquetFile(path_ref)
|
|
223
|
+
except Exception as e:
|
|
224
|
+
logger.error(e)
|
|
225
|
+
raise Exception("Provided ref is not a parquet file")
|
|
226
|
+
|
|
227
|
+
self.number_of_records = pf.metadata.num_rows
|
|
228
|
+
if optimize_memory:
|
|
229
|
+
self._lazy = True
|
|
230
|
+
self.data_frame = pl.scan_parquet(path_ref)
|
|
231
|
+
else:
|
|
232
|
+
self.data_frame = pl.read_parquet(path_ref)
|
|
233
|
+
|
|
234
|
+
def _finalize_initialization(self, name: str, optimize_memory: bool, schema: Optional[Any],
|
|
235
|
+
calculate_schema_stats: bool):
|
|
236
|
+
"""Finalize initialization by setting remaining attributes."""
|
|
237
|
+
_ = calculate_schema_stats
|
|
238
|
+
self.name = name
|
|
239
|
+
self._optimize_memory = optimize_memory
|
|
240
|
+
pl_schema = self.data_frame.collect_schema()
|
|
241
|
+
self._schema = self._handle_schema(schema, pl_schema)
|
|
242
|
+
self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()
|
|
243
|
+
|
|
244
|
+
def __getitem__(self, item):
|
|
245
|
+
"""Access a specific column or item from the DataFrame."""
|
|
246
|
+
return self.data_frame.select([item])
|
|
247
|
+
|
|
248
|
+
@property
|
|
249
|
+
def data_frame(self) -> pl.LazyFrame | pl.DataFrame:
|
|
250
|
+
"""Get the underlying DataFrame with appropriate handling of different states."""
|
|
251
|
+
if self._data_frame is not None and not self.is_future:
|
|
252
|
+
return self._data_frame
|
|
253
|
+
elif self.is_future:
|
|
254
|
+
return self._data_frame
|
|
255
|
+
elif self._external_source is not None and self.lazy:
|
|
256
|
+
return self._data_frame
|
|
257
|
+
elif self._external_source is not None and not self.lazy:
|
|
258
|
+
if self._external_source.get_pl_df() is None:
|
|
259
|
+
data_frame = list(self._external_source.get_iter())
|
|
260
|
+
if len(data_frame) > 0:
|
|
261
|
+
self.data_frame = pl.DataFrame(data_frame)
|
|
262
|
+
else:
|
|
263
|
+
self.data_frame = self._external_source.get_pl_df()
|
|
264
|
+
self.calculate_schema()
|
|
265
|
+
return self._data_frame
|
|
266
|
+
|
|
267
|
+
@data_frame.setter
|
|
268
|
+
def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
|
|
269
|
+
"""Set the underlying DataFrame with validation."""
|
|
270
|
+
if self.lazy and isinstance(df, pl.DataFrame):
|
|
271
|
+
raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
|
|
272
|
+
self._data_frame = df
|
|
273
|
+
|
|
274
|
+
@property
|
|
275
|
+
def schema(self) -> List[FlowfileColumn]:
|
|
276
|
+
"""Get the schema of the DataFrame, calculating if necessary."""
|
|
277
|
+
if self.number_of_fields == 0:
|
|
278
|
+
return []
|
|
279
|
+
if self._schema is None or (self._calculate_schema_stats and not self.ind_schema_calculated):
|
|
280
|
+
if self._calculate_schema_stats and not self.ind_schema_calculated:
|
|
281
|
+
schema_stats = self._calculate_schema()
|
|
282
|
+
self.ind_schema_calculated = True
|
|
283
|
+
else:
|
|
284
|
+
schema_stats = [
|
|
285
|
+
dict(column_name=k, pl_datatype=v, col_index=i)
|
|
286
|
+
for i, (k, v) in enumerate(self.data_frame.collect_schema().items())
|
|
287
|
+
]
|
|
288
|
+
self._schema = convert_stats_to_column_info(schema_stats)
|
|
289
|
+
return self._schema
|
|
290
|
+
|
|
291
|
+
@property
|
|
292
|
+
def number_of_fields(self) -> int:
|
|
293
|
+
"""Get the number of fields in the DataFrame."""
|
|
294
|
+
if self.__number_of_fields is None:
|
|
295
|
+
self.__number_of_fields = len(self.columns)
|
|
296
|
+
return self.__number_of_fields
|
|
297
|
+
|
|
298
|
+
# Data Collection and Sampling Methods
|
|
299
|
+
|
|
300
|
+
def collect(self, n_records: int = None) -> pl.DataFrame:
|
|
301
|
+
"""
|
|
302
|
+
Collect data from the DataFrame, optionally limiting the number of records.
|
|
303
|
+
Handles streaming and error cases appropriately.
|
|
304
|
+
"""
|
|
305
|
+
if n_records is None:
|
|
306
|
+
logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
|
|
307
|
+
else:
|
|
308
|
+
logger.info(f'Fetching {n_records} record(s) for Table object "{id(self)}". '
|
|
309
|
+
f'Settings: streaming={self._streamable}')
|
|
310
|
+
|
|
311
|
+
if not self.lazy:
|
|
312
|
+
return self.data_frame
|
|
313
|
+
|
|
314
|
+
try:
|
|
315
|
+
return self._collect_data(n_records)
|
|
316
|
+
except Exception as e:
|
|
317
|
+
self.errors = [e]
|
|
318
|
+
return self._handle_collection_error(n_records)
|
|
319
|
+
|
|
320
|
+
def _collect_data(self, n_records: int = None) -> pl.DataFrame:
|
|
321
|
+
"""Internal method to handle data collection."""
|
|
322
|
+
if n_records is None:
|
|
323
|
+
self.collect_external()
|
|
324
|
+
if self._streamable:
|
|
325
|
+
try:
|
|
326
|
+
logger.info('Collecting data in streaming mode')
|
|
327
|
+
return self.data_frame.collect(engine="streaming")
|
|
328
|
+
except PanicException:
|
|
329
|
+
self._streamable = False
|
|
330
|
+
|
|
331
|
+
logger.info('Collecting data in non-streaming mode')
|
|
332
|
+
return self.data_frame.collect()
|
|
333
|
+
|
|
334
|
+
if self.external_source is not None:
|
|
335
|
+
return self._collect_from_external_source(n_records)
|
|
336
|
+
|
|
337
|
+
if self._streamable:
|
|
338
|
+
return self.data_frame.head(n_records).collect(engine="streaming", comm_subplan_elim=False)
|
|
339
|
+
return self.data_frame.head(n_records).collect()
|
|
340
|
+
|
|
341
|
+
def _collect_from_external_source(self, n_records: int) -> pl.DataFrame:
|
|
342
|
+
"""Handle collection from external source."""
|
|
343
|
+
if self.external_source.get_pl_df() is not None:
|
|
344
|
+
all_data = self.external_source.get_pl_df().head(n_records)
|
|
345
|
+
self.data_frame = all_data
|
|
346
|
+
else:
|
|
347
|
+
all_data = self.external_source.get_sample(n_records)
|
|
348
|
+
self.data_frame = pl.LazyFrame(all_data)
|
|
349
|
+
return self.data_frame
|
|
350
|
+
|
|
351
|
+
def _handle_collection_error(self, n_records: int) -> pl.DataFrame:
|
|
352
|
+
"""Handle errors during collection by attempting partial collection."""
|
|
353
|
+
n_records = 100000000 if n_records is None else n_records
|
|
354
|
+
ok_cols, error_cols = self._identify_valid_columns(n_records)
|
|
355
|
+
|
|
356
|
+
if len(ok_cols) > 0:
|
|
357
|
+
return self._create_partial_dataframe(ok_cols, error_cols, n_records)
|
|
358
|
+
return self._create_empty_dataframe(n_records)
|
|
359
|
+
|
|
360
|
+
def _identify_valid_columns(self, n_records: int) -> Tuple[List[str], List[Tuple[str, Any]]]:
|
|
361
|
+
"""Identify which columns can be collected successfully."""
|
|
362
|
+
ok_cols = []
|
|
363
|
+
error_cols = []
|
|
364
|
+
for c in self.columns:
|
|
365
|
+
try:
|
|
366
|
+
_ = self.data_frame.select(c).head(n_records).collect()
|
|
367
|
+
ok_cols.append(c)
|
|
368
|
+
except:
|
|
369
|
+
error_cols.append((c, self.data_frame.schema[c]))
|
|
370
|
+
return ok_cols, error_cols
|
|
371
|
+
|
|
372
|
+
def _create_partial_dataframe(self, ok_cols: List[str], error_cols: List[Tuple[str, Any]],
|
|
373
|
+
n_records: int) -> pl.DataFrame:
|
|
374
|
+
"""Create a DataFrame with partial data for columns that could be collected."""
|
|
375
|
+
df = self.data_frame.select(ok_cols)
|
|
376
|
+
df = df.with_columns([
|
|
377
|
+
pl.lit(None).alias(column_name).cast(data_type)
|
|
378
|
+
for column_name, data_type in error_cols
|
|
379
|
+
])
|
|
380
|
+
return df.select(self.columns).head(n_records).collect()
|
|
381
|
+
|
|
382
|
+
def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
|
|
383
|
+
"""Create an empty DataFrame with the correct schema."""
|
|
384
|
+
if self.number_of_records > 0:
|
|
385
|
+
return pl.DataFrame({
|
|
386
|
+
column_name: pl.Series(
|
|
387
|
+
name=column_name,
|
|
388
|
+
values=[None] * min(self.number_of_records, n_records)
|
|
389
|
+
).cast(data_type)
|
|
390
|
+
for column_name, data_type in self.data_frame.schema.items()
|
|
391
|
+
})
|
|
392
|
+
return pl.DataFrame(schema=self.data_frame.schema)
|
|
393
|
+
|
|
394
|
+
# Data Transformation Methods
|
|
395
|
+
|
|
396
|
+
def do_group_by(self, group_by_input: transform_schemas.GroupByInput,
|
|
397
|
+
calculate_schema_stats: bool = True) -> "FlowDataEngine":
|
|
398
|
+
"""Perform group by operations on the DataFrame."""
|
|
399
|
+
aggregations = [c for c in group_by_input.agg_cols if c.agg != 'groupby']
|
|
400
|
+
group_columns = [c for c in group_by_input.agg_cols if c.agg == 'groupby']
|
|
401
|
+
|
|
402
|
+
if len(group_columns) == 0:
|
|
403
|
+
return FlowDataEngine(
|
|
404
|
+
self.data_frame.select(
|
|
405
|
+
ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations
|
|
406
|
+
),
|
|
407
|
+
calculate_schema_stats=calculate_schema_stats
|
|
408
|
+
)
|
|
409
|
+
|
|
410
|
+
df = self.data_frame.rename({c.old_name: c.new_name for c in group_columns})
|
|
411
|
+
group_by_columns = [n_c.new_name for n_c in group_columns]
|
|
412
|
+
return FlowDataEngine(
|
|
413
|
+
df.group_by(*group_by_columns).agg(
|
|
414
|
+
ac.agg_func(ac.old_name).alias(ac.new_name) for ac in aggregations
|
|
415
|
+
),
|
|
416
|
+
calculate_schema_stats=calculate_schema_stats
|
|
417
|
+
)
|
|
418
|
+
|
|
419
|
+
def do_sort(self, sorts: List[transform_schemas.SortByInput]) -> "FlowDataEngine":
|
|
420
|
+
"""Sort the DataFrame based on specified columns and directions."""
|
|
421
|
+
if not sorts:
|
|
422
|
+
return self
|
|
423
|
+
|
|
424
|
+
descending = [s.how == 'desc' or s.how.lower() == 'descending' for s in sorts]
|
|
425
|
+
df = self.data_frame.sort([sort_by.column for sort_by in sorts], descending=descending)
|
|
426
|
+
return FlowDataEngine(df, number_of_records=self.number_of_records, schema=self.schema)
|
|
427
|
+
|
|
428
|
+
def change_column_types(self, transforms: List[transform_schemas.SelectInput],
|
|
429
|
+
calculate_schema: bool = False) -> "FlowDataEngine":
|
|
430
|
+
"""Change the data types of specified columns."""
|
|
431
|
+
dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
|
|
432
|
+
idx_mapping = list(
|
|
433
|
+
(transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
|
|
434
|
+
for transform in transforms if transform.data_type is not None
|
|
435
|
+
)
|
|
436
|
+
|
|
437
|
+
actual_transforms = [c for c in idx_mapping if c[2] != dtypes[c[1]]]
|
|
438
|
+
transformations = [
|
|
439
|
+
utils.define_pl_col_transformation(col_name=transform[0], col_type=transform[2])
|
|
440
|
+
for transform in actual_transforms
|
|
441
|
+
]
|
|
442
|
+
|
|
443
|
+
df = self.data_frame.with_columns(transformations)
|
|
444
|
+
return FlowDataEngine(
|
|
445
|
+
df,
|
|
446
|
+
number_of_records=self.number_of_records,
|
|
447
|
+
calculate_schema_stats=calculate_schema,
|
|
448
|
+
streamable=self._streamable
|
|
449
|
+
)
|
|
450
|
+
|
|
451
|
+
# Data Export and Conversion Methods
|
|
452
|
+
|
|
453
|
+
def save(self, path: str, data_type: str = 'parquet') -> Future:
|
|
454
|
+
"""Save the DataFrame to a file."""
|
|
455
|
+
estimated_size = deepcopy(self.get_estimated_file_size() * 4)
|
|
456
|
+
df = deepcopy(self.data_frame)
|
|
457
|
+
return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)
|
|
458
|
+
|
|
459
|
+
def to_pylist(self) -> List[Dict]:
|
|
460
|
+
"""Convert the DataFrame to a list of dictionaries."""
|
|
461
|
+
if self.lazy:
|
|
462
|
+
return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
|
|
463
|
+
return self.data_frame.to_dicts()
|
|
464
|
+
|
|
465
|
+
@classmethod
|
|
466
|
+
def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
|
|
467
|
+
"""Create a FlowDataEngine from an external data source."""
|
|
468
|
+
if external_source.schema is not None:
|
|
469
|
+
ff = cls.create_from_schema(external_source.schema)
|
|
470
|
+
elif external_source.initial_data_getter is not None:
|
|
471
|
+
ff = cls(raw_data=external_source.initial_data_getter())
|
|
472
|
+
else:
|
|
473
|
+
ff = cls()
|
|
474
|
+
ff._external_source = external_source
|
|
475
|
+
return ff
|
|
476
|
+
|
|
477
|
+
@classmethod
|
|
478
|
+
def create_from_sql(cls, sql: str, conn: Any) -> "FlowDataEngine":
|
|
479
|
+
"""Create a FlowDataEngine from a SQL query."""
|
|
480
|
+
return cls(pl.read_sql(sql, conn))
|
|
481
|
+
|
|
482
|
+
@classmethod
|
|
483
|
+
def create_from_schema(cls, schema: List[FlowfileColumn]) -> "FlowDataEngine":
|
|
484
|
+
"""Create a FlowDataEngine from a schema definition."""
|
|
485
|
+
pl_schema = []
|
|
486
|
+
for i, flow_file_column in enumerate(schema):
|
|
487
|
+
pl_schema.append((flow_file_column.name, type_to_polars(flow_file_column.data_type)))
|
|
488
|
+
schema[i].col_index = i
|
|
489
|
+
df = pl.LazyFrame(schema=pl_schema)
|
|
490
|
+
return cls(df, schema=schema, calculate_schema_stats=False, number_of_records=0)
|
|
491
|
+
|
|
492
|
+
@classmethod
|
|
493
|
+
def create_from_path(cls, received_table: input_schema.ReceivedTableBase) -> "FlowDataEngine":
|
|
494
|
+
"""Create a FlowDataEngine from a file path."""
|
|
495
|
+
received_table.set_absolute_filepath()
|
|
496
|
+
|
|
497
|
+
file_type_handlers = {
|
|
498
|
+
'csv': create_funcs.create_from_path_csv,
|
|
499
|
+
'parquet': create_funcs.create_from_path_parquet,
|
|
500
|
+
'excel': create_funcs.create_from_path_excel
|
|
501
|
+
}
|
|
502
|
+
|
|
503
|
+
handler = file_type_handlers.get(received_table.file_type)
|
|
504
|
+
if not handler:
|
|
505
|
+
raise Exception(f'Cannot create from {received_table.file_type}')
|
|
506
|
+
|
|
507
|
+
flow_file = cls(handler(received_table))
|
|
508
|
+
flow_file._org_path = received_table.abs_file_path
|
|
509
|
+
return flow_file
|
|
510
|
+
|
|
511
|
+
@classmethod
|
|
512
|
+
def create_random(cls, number_of_records: int = 1000) -> "FlowDataEngine":
|
|
513
|
+
"""Create a FlowDataEngine with random data."""
|
|
514
|
+
return cls(create_fake_data(number_of_records))
|
|
515
|
+
|
|
516
|
+
@classmethod
|
|
517
|
+
def generate_enumerator(cls, length: int = 1000, output_name: str = 'output_column') -> "FlowDataEngine":
|
|
518
|
+
"""Generate a sequence of numbers as a FlowDataEngine."""
|
|
519
|
+
if length > 10_000_000:
|
|
520
|
+
length = 10_000_000
|
|
521
|
+
return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
|
|
522
|
+
|
|
523
|
+
# Schema Handling Methods
|
|
524
|
+
|
|
525
|
+
def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema,
|
|
526
|
+
pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
|
|
527
|
+
"""Handle schema processing and validation."""
|
|
528
|
+
if schema is None:
|
|
529
|
+
return None
|
|
530
|
+
|
|
531
|
+
if schema.__len__() != pl_schema.__len__():
|
|
532
|
+
raise Exception(
|
|
533
|
+
f'Schema does not match the data got {schema.__len__()} columns expected {pl_schema.__len__()}')
|
|
534
|
+
|
|
535
|
+
if isinstance(schema, pl.Schema):
|
|
536
|
+
return self._handle_polars_schema(schema, pl_schema)
|
|
537
|
+
elif isinstance(schema, list) and len(schema) == 0:
|
|
538
|
+
return []
|
|
539
|
+
elif isinstance(schema[0], str):
|
|
540
|
+
return self._handle_string_schema(schema, pl_schema)
|
|
541
|
+
return schema
|
|
542
|
+
|
|
543
|
+
def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
|
|
544
|
+
"""Handle Polars schema conversion."""
|
|
545
|
+
flow_file_columns = [
|
|
546
|
+
FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
|
|
547
|
+
for col_name, dtype in zip(schema.names(), schema.dtypes())
|
|
548
|
+
]
|
|
549
|
+
|
|
550
|
+
select_arg = [
|
|
551
|
+
pl.col(o).alias(n).cast(schema_dtype)
|
|
552
|
+
for o, n, schema_dtype in zip(pl_schema.names(), schema.names(), schema.dtypes())
|
|
553
|
+
]
|
|
554
|
+
|
|
555
|
+
self.data_frame = self.data_frame.select(select_arg)
|
|
556
|
+
return flow_file_columns
|
|
557
|
+
|
|
558
|
+
def _handle_string_schema(self, schema: List[str], pl_schema: pl.Schema) -> List[FlowfileColumn]:
|
|
559
|
+
"""Handle string-based schema conversion."""
|
|
560
|
+
flow_file_columns = [
|
|
561
|
+
FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
|
|
562
|
+
for col_name, dtype in zip(schema, pl_schema.dtypes())
|
|
563
|
+
]
|
|
564
|
+
|
|
565
|
+
self.data_frame = self.data_frame.rename({
|
|
566
|
+
o: n for o, n in zip(pl_schema.names(), schema)
|
|
567
|
+
})
|
|
568
|
+
|
|
569
|
+
return flow_file_columns
|
|
570
|
+
|
|
571
|
+
# Data Manipulation Methods
|
|
572
|
+
|
|
573
|
+
def split(self, split_input: transform_schemas.TextToRowsInput) -> "FlowDataEngine":
|
|
574
|
+
"""Split a column into multiple rows based on a delimiter."""
|
|
575
|
+
output_column_name = (
|
|
576
|
+
split_input.output_column_name
|
|
577
|
+
if split_input.output_column_name
|
|
578
|
+
else split_input.column_to_split
|
|
579
|
+
)
|
|
580
|
+
|
|
581
|
+
split_value = (
|
|
582
|
+
split_input.split_fixed_value
|
|
583
|
+
if split_input.split_by_fixed_value
|
|
584
|
+
else pl.col(split_input.split_by_column)
|
|
585
|
+
)
|
|
586
|
+
|
|
587
|
+
df = (
|
|
588
|
+
self.data_frame.with_columns(
|
|
589
|
+
pl.col(split_input.column_to_split)
|
|
590
|
+
.str.split(by=split_value)
|
|
591
|
+
.alias(output_column_name)
|
|
592
|
+
)
|
|
593
|
+
.explode(output_column_name)
|
|
594
|
+
)
|
|
595
|
+
|
|
596
|
+
return FlowDataEngine(df)
|
|
597
|
+
|
|
598
|
+
def unpivot(self, unpivot_input: transform_schemas.UnpivotInput) -> "FlowDataEngine":
|
|
599
|
+
"""Convert data from wide to long format."""
|
|
600
|
+
lf = self.data_frame
|
|
601
|
+
|
|
602
|
+
if unpivot_input.data_type_selector_expr is not None:
|
|
603
|
+
result = lf.unpivot(
|
|
604
|
+
on=unpivot_input.data_type_selector_expr(),
|
|
605
|
+
index=unpivot_input.index_columns
|
|
606
|
+
)
|
|
607
|
+
elif unpivot_input.value_columns is not None:
|
|
608
|
+
result = lf.unpivot(
|
|
609
|
+
on=unpivot_input.value_columns,
|
|
610
|
+
index=unpivot_input.index_columns
|
|
611
|
+
)
|
|
612
|
+
else:
|
|
613
|
+
result = lf.unpivot()
|
|
614
|
+
|
|
615
|
+
return FlowDataEngine(result)
|
|
616
|
+
|
|
617
|
+
def do_pivot(self, pivot_input: transform_schemas.PivotInput, node_logger: NodeLogger = None) -> "FlowDataEngine":
|
|
618
|
+
"""Convert data from long to wide format with aggregations."""
|
|
619
|
+
# Get unique values for pivot columns
|
|
620
|
+
max_unique_vals = 200
|
|
621
|
+
new_cols_unique = fetch_unique_values(self.data_frame.select(pivot_input.pivot_column)
|
|
622
|
+
.unique()
|
|
623
|
+
.sort(pivot_input.pivot_column)
|
|
624
|
+
.limit(max_unique_vals).cast(pl.String))
|
|
625
|
+
if len(new_cols_unique) >= max_unique_vals:
|
|
626
|
+
if node_logger:
|
|
627
|
+
node_logger.warning('Pivot column has too many unique values. Please consider using a different column.'
|
|
628
|
+
f' Max unique values: {max_unique_vals}')
|
|
629
|
+
|
|
630
|
+
if len(pivot_input.index_columns) == 0:
|
|
631
|
+
no_index_cols = True
|
|
632
|
+
pivot_input.index_columns = ['__temp__']
|
|
633
|
+
ff = self.apply_flowfile_formula('1', col_name='__temp__')
|
|
634
|
+
else:
|
|
635
|
+
no_index_cols = False
|
|
636
|
+
ff = self
|
|
637
|
+
|
|
638
|
+
# Perform pivot operations
|
|
639
|
+
index_columns = pivot_input.get_index_columns()
|
|
640
|
+
grouped_ff = ff.do_group_by(pivot_input.get_group_by_input(), False)
|
|
641
|
+
pivot_column = pivot_input.get_pivot_column()
|
|
642
|
+
|
|
643
|
+
input_df = grouped_ff.data_frame.with_columns(
|
|
644
|
+
pivot_column.cast(pl.String).alias(pivot_input.pivot_column)
|
|
645
|
+
)
|
|
646
|
+
number_of_aggregations = len(pivot_input.aggregations)
|
|
647
|
+
df = (
|
|
648
|
+
input_df.select(
|
|
649
|
+
*index_columns,
|
|
650
|
+
pivot_column,
|
|
651
|
+
pivot_input.get_values_expr()
|
|
652
|
+
)
|
|
653
|
+
.group_by(*index_columns)
|
|
654
|
+
.agg([
|
|
655
|
+
(pl.col('vals').filter(pivot_column == new_col_value))
|
|
656
|
+
.first()
|
|
657
|
+
.alias(new_col_value)
|
|
658
|
+
for new_col_value in new_cols_unique
|
|
659
|
+
])
|
|
660
|
+
.select(
|
|
661
|
+
*index_columns,
|
|
662
|
+
*[
|
|
663
|
+
pl.col(new_col).struct.field(agg).alias(f'{new_col + "_" + agg if number_of_aggregations > 1 else new_col }')
|
|
664
|
+
for new_col in new_cols_unique
|
|
665
|
+
for agg in pivot_input.aggregations
|
|
666
|
+
]
|
|
667
|
+
)
|
|
668
|
+
)
|
|
669
|
+
|
|
670
|
+
# Clean up temporary columns if needed
|
|
671
|
+
if no_index_cols:
|
|
672
|
+
df = df.drop('__temp__')
|
|
673
|
+
pivot_input.index_columns = []
|
|
674
|
+
|
|
675
|
+
return FlowDataEngine(df, calculate_schema_stats=False)
|
|
676
|
+
|
|
677
|
+
def do_filter(self, predicate: str) -> "FlowDataEngine":
|
|
678
|
+
"""Filter the DataFrame based on a predicate expression."""
|
|
679
|
+
try:
|
|
680
|
+
f = to_expr(predicate)
|
|
681
|
+
except Exception as e:
|
|
682
|
+
logger.warning(f'Error in filter expression: {e}')
|
|
683
|
+
f = to_expr("False")
|
|
684
|
+
df = self.data_frame.filter(f)
|
|
685
|
+
return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)
|
|
686
|
+
|
|
687
|
+
def add_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
|
|
688
|
+
"""Add a record ID column with optional grouping."""
|
|
689
|
+
if record_id_settings.group_by and len(record_id_settings.group_by_columns) > 0:
|
|
690
|
+
return self._add_grouped_record_id(record_id_settings)
|
|
691
|
+
return self._add_simple_record_id(record_id_settings)
|
|
692
|
+
|
|
693
|
+
def _add_grouped_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
|
|
694
|
+
"""Add a record ID column with grouping."""
|
|
695
|
+
select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]
|
|
696
|
+
|
|
697
|
+
df = (
|
|
698
|
+
self.data_frame
|
|
699
|
+
.with_columns(pl.lit(1).alias(record_id_settings.output_column_name))
|
|
700
|
+
.with_columns(
|
|
701
|
+
(pl.cum_count(record_id_settings.output_column_name)
|
|
702
|
+
.over(record_id_settings.group_by_columns) + record_id_settings.offset - 1)
|
|
703
|
+
.alias(record_id_settings.output_column_name)
|
|
704
|
+
)
|
|
705
|
+
.select(select_cols)
|
|
706
|
+
)
|
|
707
|
+
|
|
708
|
+
output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, 'UInt64')]
|
|
709
|
+
output_schema.extend(self.schema)
|
|
710
|
+
|
|
711
|
+
return FlowDataEngine(df, schema=output_schema)
|
|
712
|
+
|
|
713
|
+
def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
|
|
714
|
+
"""Add a simple sequential record ID column."""
|
|
715
|
+
df = self.data_frame.with_row_index(
|
|
716
|
+
record_id_settings.output_column_name,
|
|
717
|
+
record_id_settings.offset
|
|
718
|
+
)
|
|
719
|
+
|
|
720
|
+
output_schema = [FlowfileColumn.from_input(record_id_settings.output_column_name, 'UInt64')]
|
|
721
|
+
output_schema.extend(self.schema)
|
|
722
|
+
|
|
723
|
+
return FlowDataEngine(df, schema=output_schema)
|
|
724
|
+
|
|
725
|
+
# Utility Methods
|
|
726
|
+
|
|
727
|
+
def get_schema_column(self, col_name: str) -> FlowfileColumn:
|
|
728
|
+
"""Get schema information for a specific column."""
|
|
729
|
+
for s in self.schema:
|
|
730
|
+
if s.name == col_name:
|
|
731
|
+
return s
|
|
732
|
+
|
|
733
|
+
def get_estimated_file_size(self) -> int:
|
|
734
|
+
"""Get the estimated size of the file in bytes."""
|
|
735
|
+
if self._org_path is not None:
|
|
736
|
+
return os.path.getsize(self._org_path)
|
|
737
|
+
return 0
|
|
738
|
+
|
|
739
|
+
def __repr__(self) -> str:
|
|
740
|
+
"""Return string representation of the FlowDataEngine."""
|
|
741
|
+
return f'flowfile table\n{self.data_frame.__repr__()}'
|
|
742
|
+
|
|
743
|
+
def __call__(self) -> "FlowDataEngine":
|
|
744
|
+
"""Make the class callable, returning self."""
|
|
745
|
+
return self
|
|
746
|
+
|
|
747
|
+
def __len__(self) -> int:
|
|
748
|
+
"""Get the number of records in the table."""
|
|
749
|
+
return self.number_of_records if self.number_of_records >= 0 else self.get_number_of_records()
|
|
750
|
+
|
|
751
|
+
def cache(self) -> "FlowDataEngine":
|
|
752
|
+
"""
|
|
753
|
+
Cache the data in background and update the DataFrame reference.
|
|
754
|
+
|
|
755
|
+
Returns:
|
|
756
|
+
FlowDataEngine: Self with cached data
|
|
757
|
+
"""
|
|
758
|
+
edf = ExternalDfFetcher(lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False,
|
|
759
|
+
flow_id=-1,
|
|
760
|
+
node_id=-1)
|
|
761
|
+
logger.info('Caching data in background')
|
|
762
|
+
result = edf.get_result()
|
|
763
|
+
if isinstance(result, pl.LazyFrame):
|
|
764
|
+
logger.info('Data cached')
|
|
765
|
+
del self._data_frame
|
|
766
|
+
self.data_frame = result
|
|
767
|
+
logger.info('Data loaded from cache')
|
|
768
|
+
return self
|
|
769
|
+
|
|
770
|
+
def collect_external(self):
|
|
771
|
+
"""Collect data from external source if present."""
|
|
772
|
+
if self._external_source is not None:
|
|
773
|
+
logger.info('Collecting external source')
|
|
774
|
+
if self.external_source.get_pl_df() is not None:
|
|
775
|
+
self.data_frame = self.external_source.get_pl_df().lazy()
|
|
776
|
+
else:
|
|
777
|
+
self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
|
|
778
|
+
self._schema = None # enforce reset schema
|
|
779
|
+
|
|
780
|
+
# Data Access Methods
|
|
781
|
+
def get_output_sample(self, n_rows: int = 10) -> List[Dict]:
|
|
782
|
+
"""
|
|
783
|
+
Get a sample of the data as a list of dictionaries.
|
|
784
|
+
|
|
785
|
+
Args:
|
|
786
|
+
n_rows: Number of rows to sample
|
|
787
|
+
|
|
788
|
+
Returns:
|
|
789
|
+
List[Dict]: Sample data as dictionaries
|
|
790
|
+
"""
|
|
791
|
+
if self.number_of_records > n_rows or self.number_of_records < 0:
|
|
792
|
+
df = self.collect(n_rows)
|
|
793
|
+
else:
|
|
794
|
+
df = self.collect()
|
|
795
|
+
return df.to_dicts()
|
|
796
|
+
|
|
797
|
+
def __get_sample__(self, n_rows: int = 100, streamable: bool = True) -> "FlowDataEngine":
|
|
798
|
+
if not self.lazy:
|
|
799
|
+
df = self.data_frame.lazy()
|
|
800
|
+
else:
|
|
801
|
+
df = self.data_frame
|
|
802
|
+
|
|
803
|
+
if streamable:
|
|
804
|
+
try:
|
|
805
|
+
df = df.head(n_rows).collect()
|
|
806
|
+
except Exception as e:
|
|
807
|
+
logger.warning(f'Error in getting sample: {e}')
|
|
808
|
+
df = df.head(n_rows).collect(engine="auto")
|
|
809
|
+
else:
|
|
810
|
+
df = self.collect()
|
|
811
|
+
return FlowDataEngine(df, number_of_records=len(df), schema=self.schema)
|
|
812
|
+
|
|
813
|
+
def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
|
|
814
|
+
seed: int = None) -> "FlowDataEngine":
|
|
815
|
+
"""
|
|
816
|
+
Get a sample of rows from the DataFrame.
|
|
817
|
+
|
|
818
|
+
Args:
|
|
819
|
+
n_rows: Number of rows to sample
|
|
820
|
+
random: Whether to randomly sample
|
|
821
|
+
shuffle: Whether to shuffle the sample
|
|
822
|
+
seed: Random seed for reproducibility
|
|
823
|
+
|
|
824
|
+
Returns:
|
|
825
|
+
FlowDataEngine: New instance with sampled data
|
|
826
|
+
"""
|
|
827
|
+
n_records = min(n_rows, self.number_of_records)
|
|
828
|
+
logging.info(f'Getting sample of {n_rows} rows')
|
|
829
|
+
|
|
830
|
+
if random:
|
|
831
|
+
if self.lazy and self.external_source is not None:
|
|
832
|
+
self.collect_external()
|
|
833
|
+
|
|
834
|
+
if self.lazy and shuffle:
|
|
835
|
+
sample_df = self.data_frame.collect(engine="streaming" if self._streamable else "auto").sample(n_rows,
|
|
836
|
+
seed=seed,
|
|
837
|
+
shuffle=shuffle)
|
|
838
|
+
elif shuffle:
|
|
839
|
+
sample_df = self.data_frame.sample(n_rows, seed=seed, shuffle=shuffle)
|
|
840
|
+
else:
|
|
841
|
+
every_n_records = ceil(self.number_of_records / n_rows)
|
|
842
|
+
sample_df = self.data_frame.gather_every(every_n_records)
|
|
843
|
+
else:
|
|
844
|
+
if self.external_source:
|
|
845
|
+
self.collect(n_rows)
|
|
846
|
+
sample_df = self.data_frame.head(n_rows)
|
|
847
|
+
|
|
848
|
+
return FlowDataEngine(sample_df, schema=self.schema, number_of_records=n_records)
|
|
849
|
+
|
|
850
|
+
def get_subset(self, n_rows: int = 100) -> "FlowDataEngine":
|
|
851
|
+
"""
|
|
852
|
+
Get a subset of rows from the DataFrame.
|
|
853
|
+
|
|
854
|
+
Args:
|
|
855
|
+
n_rows: Number of rows to include
|
|
856
|
+
|
|
857
|
+
Returns:
|
|
858
|
+
FlowDataEngine: New instance with subset of data
|
|
859
|
+
"""
|
|
860
|
+
if not self.lazy:
|
|
861
|
+
return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
|
|
862
|
+
else:
|
|
863
|
+
return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
|
|
864
|
+
|
|
865
|
+
# Iterator Methods
|
|
866
|
+
def iter_batches(self, batch_size: int = 1000, columns: Union[List, Tuple, str] = None):
|
|
867
|
+
"""
|
|
868
|
+
Iterate over the DataFrame in batches.
|
|
869
|
+
|
|
870
|
+
Args:
|
|
871
|
+
batch_size: Size of each batch
|
|
872
|
+
columns: Columns to include
|
|
873
|
+
|
|
874
|
+
Yields:
|
|
875
|
+
FlowDataEngine: New instance for each batch
|
|
876
|
+
"""
|
|
877
|
+
if columns:
|
|
878
|
+
self.data_frame = self.data_frame.select(columns)
|
|
879
|
+
self.lazy = False
|
|
880
|
+
batches = self.data_frame.iter_slices(batch_size)
|
|
881
|
+
for batch in batches:
|
|
882
|
+
yield FlowDataEngine(batch)
|
|
883
|
+
|
|
884
|
+
def start_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
|
|
885
|
+
other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
|
|
886
|
+
node_id: int | str = -1) -> ExternalFuzzyMatchFetcher:
|
|
887
|
+
"""
|
|
888
|
+
Starts a fuzzy join with another DataFrame and returns the object to track.
|
|
889
|
+
|
|
890
|
+
Args:
|
|
891
|
+
fuzzy_match_input: Fuzzy matching parameters
|
|
892
|
+
other: Right DataFrame for join
|
|
893
|
+
file_ref: Reference for temporary files
|
|
894
|
+
flow_id: Flow ID for tracking
|
|
895
|
+
node_id: Node ID for tracking
|
|
896
|
+
Returns:
|
|
897
|
+
FlowDataEngine: New instance with joined data
|
|
898
|
+
"""
|
|
899
|
+
left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
|
|
900
|
+
fuzzy_match_input=fuzzy_match_input)
|
|
901
|
+
return ExternalFuzzyMatchFetcher(left_df, right_df,
|
|
902
|
+
fuzzy_maps=fuzzy_match_input.fuzzy_maps,
|
|
903
|
+
file_ref=file_ref + '_fm',
|
|
904
|
+
wait_on_completion=False,
|
|
905
|
+
flow_id=flow_id,
|
|
906
|
+
node_id=node_id)
|
|
907
|
+
|
|
908
|
+
def do_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
|
|
909
|
+
other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
|
|
910
|
+
node_id: int | str = -1) -> "FlowDataEngine":
|
|
911
|
+
"""
|
|
912
|
+
Perform a fuzzy join with another DataFrame.
|
|
913
|
+
|
|
914
|
+
Args:
|
|
915
|
+
fuzzy_match_input: Fuzzy matching parameters
|
|
916
|
+
other: Right DataFrame for join
|
|
917
|
+
file_ref: Reference for temporary files
|
|
918
|
+
flow_id: Flow ID for tracking
|
|
919
|
+
node_id: Node ID for tracking
|
|
920
|
+
Returns:
|
|
921
|
+
FlowDataEngine: New instance with joined data
|
|
922
|
+
"""
|
|
923
|
+
left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
|
|
924
|
+
fuzzy_match_input=fuzzy_match_input)
|
|
925
|
+
f = ExternalFuzzyMatchFetcher(left_df, right_df,
|
|
926
|
+
fuzzy_maps=fuzzy_match_input.fuzzy_maps,
|
|
927
|
+
file_ref=file_ref + '_fm',
|
|
928
|
+
wait_on_completion=True,
|
|
929
|
+
flow_id=flow_id,
|
|
930
|
+
node_id=node_id)
|
|
931
|
+
return FlowDataEngine(f.get_result())
|
|
932
|
+
|
|
933
|
+
def fuzzy_match(self, right: "FlowDataEngine", left_on: str, right_on: str,
|
|
934
|
+
fuzzy_method: str = 'levenshtein', threshold: float = 0.75) -> "FlowDataEngine":
|
|
935
|
+
"""
|
|
936
|
+
Perform fuzzy matching between two DataFrames.
|
|
937
|
+
|
|
938
|
+
Args:
|
|
939
|
+
right: Right DataFrame for matching
|
|
940
|
+
left_on: Column from left DataFrame
|
|
941
|
+
right_on: Column from right DataFrame
|
|
942
|
+
fuzzy_method: Method for fuzzy matching
|
|
943
|
+
threshold: Matching threshold
|
|
944
|
+
|
|
945
|
+
Returns:
|
|
946
|
+
FlowDataEngine: New instance with matched data
|
|
947
|
+
"""
|
|
948
|
+
fuzzy_match_input = transform_schemas.FuzzyMatchInput(
|
|
949
|
+
[transform_schemas.FuzzyMap(
|
|
950
|
+
left_on, right_on,
|
|
951
|
+
fuzzy_type=fuzzy_method,
|
|
952
|
+
threshold_score=threshold
|
|
953
|
+
)],
|
|
954
|
+
left_select=self.columns,
|
|
955
|
+
right_select=right.columns
|
|
956
|
+
)
|
|
957
|
+
return self.do_fuzzy_join(fuzzy_match_input, right, str(id(self)))
|
|
958
|
+
|
|
959
|
+
def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
|
|
960
|
+
auto_generate_selection: bool, verify_integrity: bool,
|
|
961
|
+
other: "FlowDataEngine") -> "FlowDataEngine":
|
|
962
|
+
"""
|
|
963
|
+
Perform a cross join with another DataFrame.
|
|
964
|
+
|
|
965
|
+
Args:
|
|
966
|
+
cross_join_input: Cross join parameters
|
|
967
|
+
auto_generate_selection: Whether to auto-generate column selection
|
|
968
|
+
verify_integrity: Whether to verify join integrity
|
|
969
|
+
other: Right DataFrame for join
|
|
970
|
+
|
|
971
|
+
Returns:
|
|
972
|
+
FlowDataEngine: New instance with joined data
|
|
973
|
+
|
|
974
|
+
Raises:
|
|
975
|
+
Exception: If join would result in too many records
|
|
976
|
+
"""
|
|
977
|
+
self.lazy = True
|
|
978
|
+
other.lazy = True
|
|
979
|
+
|
|
980
|
+
verify_join_select_integrity(cross_join_input, left_columns=self.columns, right_columns=other.columns)
|
|
981
|
+
|
|
982
|
+
# if auto_generate_selection:
|
|
983
|
+
# cross_join_input.auto_rename()
|
|
984
|
+
|
|
985
|
+
right_select = [v.old_name for v in cross_join_input.right_select.renames
|
|
986
|
+
if (v.keep or v.join_key) and v.is_available]
|
|
987
|
+
left_select = [v.old_name for v in cross_join_input.left_select.renames
|
|
988
|
+
if (v.keep or v.join_key) and v.is_available]
|
|
989
|
+
|
|
990
|
+
left = self.data_frame.select(left_select).rename(cross_join_input.left_select.rename_table)
|
|
991
|
+
right = other.data_frame.select(right_select).rename(cross_join_input.right_select.rename_table)
|
|
992
|
+
|
|
993
|
+
if verify_integrity:
|
|
994
|
+
n_records = self.get_number_of_records() * other.get_number_of_records()
|
|
995
|
+
if n_records > 1_000_000_000:
|
|
996
|
+
raise Exception("Join will result in too many records, ending process")
|
|
997
|
+
else:
|
|
998
|
+
n_records = -1
|
|
999
|
+
|
|
1000
|
+
joined_df = left.join(right, how='cross')
|
|
1001
|
+
|
|
1002
|
+
cols_to_delete_after = [col.new_name for col in
|
|
1003
|
+
cross_join_input.left_select.renames + cross_join_input.left_select.renames
|
|
1004
|
+
if col.join_key and not col.keep and col.is_available]
|
|
1005
|
+
|
|
1006
|
+
if verify_integrity:
|
|
1007
|
+
return FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False,
|
|
1008
|
+
number_of_records=n_records, streamable=False)
|
|
1009
|
+
else:
|
|
1010
|
+
fl = FlowDataEngine(joined_df.drop(cols_to_delete_after), calculate_schema_stats=False,
|
|
1011
|
+
number_of_records=0, streamable=False)
|
|
1012
|
+
return fl
|
|
1013
|
+
|
|
1014
|
+
    def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
             verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
        """
        Perform a join operation with another DataFrame.

        Args:
            join_input: Join parameters
            auto_generate_selection: Whether to auto-generate column selection
            verify_integrity: Whether to verify join integrity
            other: Right DataFrame for join

        Returns:
            FlowDataEngine: New instance with joined data

        Raises:
            Exception: If join would result in too many records or is invalid
        """
        # self.lazy = False if join_input.how == 'right' else True
        # other.lazy = False if join_input.how == 'right' else True

        verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
        if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
            raise Exception('Join is not valid by the data fields')
        if auto_generate_selection:
            join_input.auto_rename()

        right_select = [v.old_name for v in join_input.right_select.renames
                        if (v.keep or v.join_key) and v.is_available]
        left_select = [v.old_name for v in join_input.left_select.renames
                       if (v.keep or v.join_key) and v.is_available]
        left = self.data_frame.select(left_select).rename(join_input.left_select.rename_table)
        right = other.data_frame.select(right_select).rename(join_input.right_select.rename_table)

        if verify_integrity and join_input.how != 'right':
            n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
                                       right_on_keys=join_input.right_join_keys, how=join_input.how)
            if n_records > 1_000_000_000:
                raise Exception("Join will result in too many records, ending process")
        else:
            n_records = -1
        if join_input.how == 'right':
            # Default to left join since right join can give panic issues in execution plan downstream
            joined_df = right.join(left, left_on=join_input.right_join_keys,
                                   right_on=join_input.left_join_keys, how="left", suffix="")
        else:
            joined_df = left.join(right, left_on=join_input.left_join_keys,
                                  right_on=join_input.right_join_keys,
                                  how=join_input.how, suffix="")
        # Note: left_select.renames is concatenated with itself here; right_select.renames was probably intended.
        cols_to_delete_after = [col.new_name for col in
                                join_input.left_select.renames + join_input.left_select.renames
                                if col.join_key and not col.keep and col.is_available]
        if len(cols_to_delete_after) > 0:
            joined_df = joined_df.drop(cols_to_delete_after)
        if verify_integrity:
            return FlowDataEngine(joined_df, calculate_schema_stats=True,
                                  number_of_records=n_records, streamable=False)
        else:
            fl = FlowDataEngine(joined_df, calculate_schema_stats=False,
                                number_of_records=0, streamable=False)
            return fl

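Note how a 'right' join is rewritten above as a left join with the inputs swapped, to avoid panics in the downstream execution plan. A hedged sketch of that equivalence on throwaway frames (column names are made up for illustration):

import polars as pl

orders = pl.LazyFrame({"customer_id": [1, 2, 2], "amount": [10, 20, 30]})
customers = pl.LazyFrame({"customer_id": [2, 3], "name": ["Ada", "Bob"]})

# "orders RIGHT JOIN customers" keeps every customer row, which is the same as
# swapping the sides and joining left - the rewrite the method above performs.
right_style = customers.join(orders, left_on="customer_id", right_on="customer_id", how="left")
print(right_style.collect())
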
    # Graph Operations
    def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
        """
        Solve a graph problem using the specified columns.

        Args:
            graph_solver_input: Graph solving parameters

        Returns:
            FlowDataEngine: New instance with solved graph data
        """
        lf = self.data_frame.with_columns(
            graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
            .alias(graph_solver_input.output_column_name)
        )
        return FlowDataEngine(lf)

    # Data Modification Methods
    def add_new_values(self, values: Iterable, col_name: str = None) -> "FlowDataEngine":
        """
        Add a new column with specified values.

        Args:
            values: Values to add
            col_name: Name for new column

        Returns:
            FlowDataEngine: New instance with added column
        """
        if col_name is None:
            col_name = 'new_values'
        return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))

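with_columns accepts a Series directly, and alias controls the resulting column name, which is all add_new_values relies on. A small Polars sketch of that pattern (eager frame used only for illustration):

import polars as pl

df = pl.DataFrame({"x": [1, 2, 3]})
# The Series length must match the frame height; alias names the new column.
out = df.with_columns(pl.Series(["a", "b", "c"]).alias("new_values"))
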
    def get_record_count(self) -> "FlowDataEngine":
        """
        Get the total number of records.

        Returns:
            FlowDataEngine: New instance with record count
        """
        return FlowDataEngine(self.data_frame.select(pl.len().alias('number_of_records')))

    def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
        """
        Assert that this DataFrame is equal to another.

        Args:
            other: DataFrame to compare with
            ordered: Whether to consider row order
            strict_schema: Whether to strictly compare schemas

        Raises:
            Exception: If DataFrames are not equal
        """
        org_laziness = self.lazy, other.lazy
        self.lazy = False
        other.lazy = False
        self.number_of_records = -1
        other.number_of_records = -1

        if self.get_number_of_records() != other.get_number_of_records():
            raise Exception('Number of records is not equal')

        if self.columns != other.columns:
            raise Exception('Schema is not equal')

        if strict_schema:
            assert self.data_frame.schema == other.data_frame.schema, 'Data types do not match'

        if ordered:
            self_lf = self.data_frame.sort(by=self.columns)
            other_lf = other.data_frame.sort(by=other.columns)
        else:
            self_lf = self.data_frame
            other_lf = other.data_frame

        self.lazy, other.lazy = org_laziness
        assert self_lf.equals(other_lf), 'Data is not equal'

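When ordered is set, assert_equal sorts both frames by every column before the final equality check, so the comparison does not depend on the incoming row order. A compact sketch of that comparison strategy:

import polars as pl

a = pl.DataFrame({"id": [2, 1], "v": ["b", "a"]})
b = pl.DataFrame({"id": [1, 2], "v": ["a", "b"]})

# Sorting both frames by all columns makes .equals() insensitive to row order.
assert a.sort(by=a.columns).equals(b.sort(by=b.columns))
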
    # Initialization Methods
    def initialize_empty_fl(self):
        """Initialize an empty LazyFrame."""
        self.data_frame = pl.LazyFrame()
        self.number_of_records = 0
        self._lazy = True

    def get_number_of_records(self, warn: bool = False, force_calculate: bool = False) -> int:
        """
        Get the total number of records in the DataFrame.

        Args:
            warn: Whether to warn about expensive operations
            force_calculate: Whether to force recalculation

        Returns:
            int: Number of records

        Raises:
            Exception: If unable to get number of records
        """
        if self.is_future and not self.is_collected:
            return -1

        if self.number_of_records is None or self.number_of_records < 0 or force_calculate:
            if self._number_of_records_callback is not None:
                self._number_of_records_callback(self)

            if self.lazy:
                if warn:
                    logger.warning('Calculating the number of records this can be expensive on a lazy frame')
                try:
                    self.number_of_records = self.data_frame.select(pl.len()).collect(
                        engine="streaming" if self._streamable else "auto")[0, 0]
                except Exception:
                    raise Exception('Could not get number of records')
            else:
                self.number_of_records = self.data_frame.__len__()

        return self.number_of_records

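Counting rows on a lazy frame forces a collect, which is why the method can warn that it is expensive. A minimal sketch of the counting pattern it uses, on a throwaway LazyFrame:

import polars as pl

lf = pl.LazyFrame({"x": range(10)})
# Only the length is computed; no other columns are materialised.
n_records = lf.select(pl.len()).collect()[0, 0]
print(n_records)  # 10
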
    # Properties
    @property
    def has_errors(self) -> bool:
        """Check if there are any errors."""
        return len(self.errors) > 0

    @property
    def lazy(self) -> bool:
        """Check if DataFrame is lazy."""
        return self._lazy

    @lazy.setter
    def lazy(self, exec_lazy: bool = False):
        """
        Set the laziness of the DataFrame.

        Args:
            exec_lazy: Whether to make DataFrame lazy
        """
        if exec_lazy != self._lazy:
            if exec_lazy:
                self.data_frame = self.data_frame.lazy()
            else:
                self._lazy = exec_lazy
                if self.external_source is not None:
                    df = self.collect()
                    self.data_frame = df
                else:
                    self.data_frame = self.data_frame.collect(engine="streaming" if self._streamable else "auto")
            self._lazy = exec_lazy

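The lazy setter switches between the two Polars frame representations in place: .lazy() wraps an eager DataFrame without computing anything, while .collect() (with the streaming engine when enabled, as above) materialises a LazyFrame. A short sketch of the round trip:

import polars as pl

df = pl.DataFrame({"x": [1, 2, 3]})
lf = df.lazy()           # eager -> lazy, no computation happens here
df_again = lf.collect()  # lazy -> eager, executes the accumulated plan
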
    @property
    def external_source(self) -> ExternalDataSource:
        """Get the external data source."""
        return self._external_source

    @property
    def cols_idx(self) -> Dict[str, int]:
        """Get column index mapping."""
        if self._col_idx is None:
            self._col_idx = {c: i for i, c in enumerate(self.columns)}
        return self._col_idx

    @property
    def __name__(self) -> str:
        """Get table name."""
        return self.name

    # Schema and Column Operations
    def get_select_inputs(self) -> transform_schemas.SelectInputs:
        """
        Get select inputs for all columns.

        Returns:
            SelectInputs: Input specifications for all columns
        """
        return transform_schemas.SelectInputs(
            [transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
        )

    def select_columns(self, list_select: Union[List[str], Tuple[str], str]) -> "FlowDataEngine":
        """
        Select specific columns from the DataFrame.

        Args:
            list_select: Columns to select

        Returns:
            FlowDataEngine: New instance with selected columns
        """
        if isinstance(list_select, str):
            list_select = [list_select]

        idx_to_keep = [self.cols_idx.get(c) for c in list_select]
        selects = [ls for ls, id_to_keep in zip(list_select, idx_to_keep) if id_to_keep is not None]
        new_schema = [self.schema[i] for i in idx_to_keep if i is not None]

        return FlowDataEngine(
            self.data_frame.select(selects),
            number_of_records=self.number_of_records,
            schema=new_schema,
            streamable=self._streamable
        )

    def drop_columns(self, columns: List[str]) -> "FlowDataEngine":
        """
        Drop specified columns from the DataFrame.

        Args:
            columns: Columns to drop

        Returns:
            FlowDataEngine: New instance without dropped columns
        """
        # Note: building the selection from a set means the remaining columns are not guaranteed
        # to keep their original order.
        cols_for_select = tuple(set(self.columns) - set(columns))
        idx_to_keep = [self.cols_idx.get(c) for c in cols_for_select]
        new_schema = [self.schema[i] for i in idx_to_keep]

        return FlowDataEngine(
            self.data_frame.select(cols_for_select),
            number_of_records=self.number_of_records,
            schema=new_schema
        )

    def reorganize_order(self, column_order: List[str]) -> "FlowDataEngine":
        """
        Reorganize columns in specified order.

        Args:
            column_order: Desired column order

        Returns:
            FlowDataEngine: New instance with reordered columns
        """
        df = self.data_frame.select(column_order)
        schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
        return FlowDataEngine(df, schema=schema, number_of_records=self.number_of_records)

    def apply_flowfile_formula(self, func: str, col_name: str,
                               output_data_type: pl.DataType = None) -> "FlowDataEngine":
        """
        Apply a formula to create a new column.

        Args:
            func: Formula to apply
            col_name: Name for new column
            output_data_type: Data type for output

        Returns:
            FlowDataEngine: New instance with added column
        """
        parsed_func = to_expr(func)
        if output_data_type is not None:
            df2 = self.data_frame.with_columns(parsed_func.cast(output_data_type).alias(col_name))
        else:
            df2 = self.data_frame.with_columns(parsed_func.alias(col_name))

        return FlowDataEngine(df2, number_of_records=self.number_of_records)

    def apply_sql_formula(self, func: str, col_name: str,
                          output_data_type: pl.DataType = None) -> "FlowDataEngine":
        """
        Apply an SQL-style formula to create a new column.

        Args:
            func: SQL formula to apply
            col_name: Name for new column
            output_data_type: Data type for output

        Returns:
            FlowDataEngine: New instance with added column
        """
        expr = to_expr(func)
        if output_data_type is not None:
            df = self.data_frame.with_columns(expr.cast(output_data_type).alias(col_name))
        else:
            df = self.data_frame.with_columns(expr.alias(col_name))

        return FlowDataEngine(df, number_of_records=self.number_of_records)

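to_expr is Flowfile's own formula parser, but the surrounding pattern is plain Polars: build an expression, optionally cast it, alias it, and attach it with with_columns. A sketch using Polars' built-in SQL expression parser as a stand-in for to_expr (pl.sql_expr is only an illustration, not what the package calls):

import polars as pl

df = pl.DataFrame({"price": [10.0, 20.0], "qty": [2, 3]})
expr = pl.sql_expr("price * qty")
# Cast and alias mirror the output_data_type / col_name handling above.
out = df.with_columns(expr.cast(pl.Int64).alias("total"))
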
    def output(self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str,
               execute_remote: bool = True) -> "FlowDataEngine":
        """
        Write DataFrame to output file.

        Args:
            output_fs: Output settings.
            flow_id: Flow ID for tracking.
            node_id: Node ID for tracking.
            execute_remote: If the output should be executed at the flowfile worker process.
        Returns:
            FlowDataEngine: Self for chaining
        """
        logger.info('Starting to write output')
        if execute_remote:
            status = utils.write_output(
                self.data_frame,
                data_type=output_fs.file_type,
                path=output_fs.abs_file_path,
                write_mode=output_fs.write_mode,
                sheet_name=output_fs.output_excel_table.sheet_name,
                delimiter=output_fs.output_csv_table.delimiter,
                flow_id=flow_id,
                node_id=node_id
            )
            tracker = ExternalExecutorTracker(status)
            tracker.get_result()
            logger.info('Finished writing output')
        else:
            logger.info("Starting to write results locally")
            utils.local_write_output(
                self.data_frame,
                data_type=output_fs.file_type,
                path=output_fs.abs_file_path,
                write_mode=output_fs.write_mode,
                sheet_name=output_fs.output_excel_table.sheet_name,
                delimiter=output_fs.output_csv_table.delimiter,
                flow_id=flow_id,
                node_id=node_id,
            )
            logger.info("Finished writing output")
        return self

    # Data Operations
    def make_unique(self, unique_input: transform_schemas.UniqueInput = None) -> "FlowDataEngine":
        """
        Get unique rows based on specified columns.

        Args:
            unique_input: Unique operation parameters

        Returns:
            FlowDataEngine: New instance with unique rows
        """
        if unique_input is None or unique_input.columns is None:
            return FlowDataEngine(self.data_frame.unique())
        return FlowDataEngine(self.data_frame.unique(unique_input.columns, keep=unique_input.strategy))

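A small Polars sketch of the deduplication call make_unique wraps; the subset and keep arguments correspond to unique_input.columns and unique_input.strategy above:

import polars as pl

df = pl.DataFrame({"key": [1, 1, 2], "value": ["a", "b", "c"]})
# Keep the first row per key; other strategies include "last", "any" and "none".
out = df.unique(subset=["key"], keep="first")
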
    def concat(self, other: Iterable["FlowDataEngine"] | "FlowDataEngine") -> "FlowDataEngine":
        """
        Concatenate with other DataFrames.

        Args:
            other: DataFrames to concatenate

        Returns:
            FlowDataEngine: Concatenated DataFrame
        """
        if isinstance(other, FlowDataEngine):
            other = [other]

        dfs: List[pl.LazyFrame] | List[pl.DataFrame] = [self.data_frame] + [flt.data_frame for flt in other]
        return FlowDataEngine(pl.concat(dfs, how='diagonal_relaxed'))

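concat relies on Polars' diagonal_relaxed strategy, which aligns columns by name, fills missing columns with nulls and relaxes dtypes where the inputs disagree. A small sketch:

import polars as pl

a = pl.DataFrame({"x": [1, 2]})
b = pl.DataFrame({"x": [3], "y": ["new"]})
# Columns are matched by name; "y" is null for the rows coming from `a`.
out = pl.concat([a, b], how="diagonal_relaxed")
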
    def do_select(self, select_inputs: transform_schemas.SelectInputs,
                  keep_missing: bool = True) -> "FlowDataEngine":
        """
        Perform complex column selection and transformation.

        Args:
            select_inputs: Selection specifications
            keep_missing: Whether to keep columns not specified

        Returns:
            FlowDataEngine: New instance with selected/transformed columns
        """
        new_schema = deepcopy(self.schema)
        renames = [r for r in select_inputs.renames if r.is_available]

        if not keep_missing:
            drop_cols = set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames).union(
                set(r.old_name for r in renames if not r.keep))
            keep_cols = []
        else:
            keep_cols = list(set(self.data_frame.collect_schema().names()) - set(r.old_name for r in renames))
            drop_cols = set(r.old_name for r in renames if not r.keep)

        if len(drop_cols) > 0:
            new_schema = [s for s in new_schema if s.name not in drop_cols]
        new_schema_mapping = {v.name: v for v in new_schema}

        available_renames = []
        for rename in renames:
            if (rename.new_name != rename.old_name or rename.new_name not in new_schema_mapping) and rename.keep:
                schema_entry = new_schema_mapping.get(rename.old_name)
                if schema_entry is not None:
                    available_renames.append(rename)
                    schema_entry.column_name = rename.new_name

        rename_dict = {r.old_name: r.new_name for r in available_renames}
        fl = self.select_columns(
            list_select=[col_to_keep.old_name for col_to_keep in renames if col_to_keep.keep] + keep_cols)
        fl = fl.change_column_types(transforms=[r for r in renames if r.keep])
        ndf = fl.data_frame.rename(rename_dict)
        renames.sort(key=lambda r: 0 if r.position is None else r.position)
        sorted_cols = utils.match_order(ndf.collect_schema().names(),
                                        [r.new_name for r in renames] + self.data_frame.collect_schema().names())
        output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
        return output_file.reorganize_order(sorted_cols)

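Stripped of the schema bookkeeping, do_select reduces to a select, a rename and a reorder on the underlying frame. A compact sketch of that chain on a throwaway frame (column names are illustrative only):

import polars as pl

df = pl.DataFrame({"a": [1], "b": [2], "c": [3]})
out = (
    df.select(["a", "b"])        # keep only the requested columns
      .rename({"a": "alpha"})    # apply the old_name -> new_name mapping
      .select(["b", "alpha"])    # reorder to the requested positions
)
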
    # Utility Methods
    def set_streamable(self, streamable: bool = False):
        """Set whether DataFrame operations should be streamable."""
        self._streamable = streamable

    def _calculate_schema(self) -> List[Dict]:
        """Calculate schema statistics."""
        if self.external_source is not None:
            self.collect_external()
        v = utils.calculate_schema(self.data_frame)
        return v

    def calculate_schema(self):
        """Calculate and return schema."""
        self._calculate_schema_stats = True
        return self.schema

    def count(self) -> int:
        """Get total number of records."""
        return self.get_number_of_records()

    @classmethod
    def create_from_path_worker(cls, received_table: input_schema.ReceivedTable, flow_id: int, node_id: int | str):
        received_table.set_absolute_filepath()
        external_fetcher = ExternalCreateFetcher(received_table=received_table,
                                                 file_type=received_table.file_type, flow_id=flow_id, node_id=node_id)
        return cls(external_fetcher.get_result())


def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowDataEngine":
    """
    Execute arbitrary Polars code.

    Args:
        code: Polars code to execute

    Returns:
        FlowDataEngine: Result of code execution
    """
    polars_executable = polars_code_parser.get_executable(code, num_inputs=len(flowfile_tables))
    if len(flowfile_tables) == 0:
        kwargs = {}
    elif len(flowfile_tables) == 1:
        kwargs = {'input_df': flowfile_tables[0].data_frame}
    else:
        kwargs = {f'input_df_{i+1}': flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
    return FlowDataEngine(polars_executable(**kwargs))
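The number of inputs decides the names the user code can reference: a single input is exposed as input_df, while several inputs become input_df_1, input_df_2, and so on. A hedged sketch of an executable consuming those keyword arguments (example_executable merely stands in for whatever polars_code_parser.get_executable returns):

import polars as pl

def example_executable(input_df_1: pl.LazyFrame, input_df_2: pl.LazyFrame) -> pl.LazyFrame:
    # Stand-in for the callable produced from the user's Polars code string.
    return pl.concat([input_df_1, input_df_2], how="diagonal_relaxed")

frames = [pl.LazyFrame({"x": [1]}), pl.LazyFrame({"x": [2]})]
kwargs = {f"input_df_{i + 1}": lf for i, lf in enumerate(frames)}
result = example_executable(**kwargs)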