Flowfile 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
- flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
- flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
- flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
- flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
- flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
- flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/METADATA +2 -2
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/RECORD +96 -94
- flowfile_core/__init__.py +1 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +1 -0
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/flowfile/code_generator/code_generator.py +71 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +597 -309
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_graph.py +619 -191
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +500 -89
- flowfile_core/flowfile/flow_node/models.py +125 -20
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +36 -5
- flowfile_core/main.py +32 -13
- flowfile_core/routes/cloud_connections.py +7 -11
- flowfile_core/routes/logs.py +2 -6
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +127 -51
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/input_schema.py +92 -64
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +144 -11
- flowfile_core/schemas/transform_schema.py +82 -17
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/__init__.py +0 -0
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +232 -110
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +150 -12
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- test_utils/s3/data_generator.py +1 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +6 -1
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/models.py +0 -193
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +0 -0
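The bulk of this release is the change to flowfile_core/flowfile/flow_data_engine/flow_data_engine.py shown below, which mostly adds Google-style docstrings to FlowDataEngine and introduces a new to_arrow() conversion method. As a quick orientation, here is a minimal usage sketch based only on the signatures and docstrings visible in the diff; the import path and the exact constructor call are assumptions, not something this diff confirms.

    import polars as pl

    # Assumed import path; the diff only shows the module file, not the public export.
    from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine

    # __init__ accepts a Polars DataFrame/LazyFrame, a list of dicts, or a RawData object.
    engine = FlowDataEngine(pl.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]}))

    print(engine.number_of_fields)  # 2 -- column-count property documented in the diff
    rows = engine.to_pylist()       # list of row dictionaries
    table = engine.to_arrow()       # new in 0.3.7: collects if lazy, returns a pyarrow.Table
    print(table.num_rows)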
|
@@ -4,7 +4,7 @@ import os
|
|
|
4
4
|
from copy import deepcopy
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from math import ceil
|
|
7
|
-
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar
|
|
7
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, TypeVar, Literal, Generator
|
|
8
8
|
|
|
9
9
|
# Third-party imports
|
|
10
10
|
from loky import Future
|
|
@@ -12,6 +12,7 @@ import polars as pl
|
|
|
12
12
|
from polars.exceptions import PanicException
|
|
13
13
|
from polars_grouper import graph_solver
|
|
14
14
|
from polars_expr_transformer import simple_function_to_expr as to_expr
|
|
15
|
+
from pyarrow import Table as PaTable
|
|
15
16
|
from pyarrow.parquet import ParquetFile
|
|
16
17
|
|
|
17
18
|
# Local imports - Core
|
|
@@ -64,7 +65,24 @@ from flowfile_core.flowfile.sources.external_sources.base_class import ExternalD
|
|
|
64
65
|
T = TypeVar('T', pl.DataFrame, pl.LazyFrame)
|
|
65
66
|
|
|
66
67
|
def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform_schemas.JoinInput) -> Tuple[T, T, Dict[str, str]]:
|
|
68
|
+
"""Temporarily renames join keys to avoid conflicts during a join.
|
|
67
69
|
|
|
70
|
+
This helper function checks the join type and renames the join key columns
|
|
71
|
+
in either the left or right DataFrame to a temporary name (`__FL_TEMP__...`).
|
|
72
|
+
This prevents Polars from automatically suffixing columns with `_right` when
|
|
73
|
+
join keys have the same name.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
left_df: The left Polars DataFrame or LazyFrame.
|
|
77
|
+
right_df: The right Polars DataFrame or LazyFrame.
|
|
78
|
+
join_input: The JoinInput settings object defining the join.
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
A tuple containing:
|
|
82
|
+
- The (potentially modified) left DataFrame.
|
|
83
|
+
- The (potentially modified) right DataFrame.
|
|
84
|
+
- A dictionary mapping the temporary names back to their desired final names.
|
|
85
|
+
"""
|
|
68
86
|
def _construct_temp_name(column_name: str) -> str:
|
|
69
87
|
return "__FL_TEMP__"+column_name
|
|
70
88
|
if join_input.how == 'right':
|
|
@@ -85,13 +103,15 @@ def _handle_duplication_join_keys(left_df: T, right_df: T, join_input: transform
|
|
|
85
103
|
|
|
86
104
|
|
|
87
105
|
def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.JoinInput) -> None:
|
|
88
|
-
"""
|
|
89
|
-
Updates the right columns of the join input by deselecting them.
|
|
90
|
-
Args:
|
|
91
|
-
join_input ():
|
|
106
|
+
"""Modifies JoinInput for semi/anti joins to not keep right-side columns.
|
|
92
107
|
|
|
93
|
-
|
|
94
|
-
|
|
108
|
+
For 'semi' and 'anti' joins, Polars only returns columns from the left
|
|
109
|
+
DataFrame. This function enforces that behavior by modifying the `join_input`
|
|
110
|
+
in-place, setting the `keep` flag to `False` for all columns in the
|
|
111
|
+
right-side selection.
|
|
112
|
+
|
|
113
|
+
Args:
|
|
114
|
+
join_input: The JoinInput settings object to modify.
|
|
95
115
|
"""
|
|
96
116
|
if join_input.how in ('semi', 'anti'):
|
|
97
117
|
for jk in join_input.right_select.renames:
|
|
@@ -99,31 +119,38 @@ def ensure_right_unselect_for_semi_and_anti_joins(join_input: transform_schemas.
|
|
|
99
119
|
|
|
100
120
|
|
|
101
121
|
def get_select_columns(full_select_input: List[transform_schemas.SelectInput]) -> List[str]:
|
|
102
|
-
"""
|
|
103
|
-
|
|
104
|
-
|
|
122
|
+
"""Extracts a list of column names to be selected from a SelectInput list.
|
|
123
|
+
|
|
124
|
+
This function filters a list of `SelectInput` objects to return the names
|
|
125
|
+
of columns that are marked as available and are either a join key or
|
|
126
|
+
explicitly marked to be kept.
|
|
127
|
+
|
|
105
128
|
Args:
|
|
106
|
-
full_select_input
|
|
129
|
+
full_select_input: A list of SelectInput objects.
|
|
107
130
|
|
|
108
131
|
Returns:
|
|
109
|
-
|
|
132
|
+
A list of column names to be selected.
|
|
110
133
|
"""
|
|
111
134
|
return [v.old_name for v in full_select_input if (v.keep or v.join_key) and v.is_available]
|
|
112
135
|
|
|
113
136
|
|
|
114
137
|
@dataclass
|
|
115
138
|
class FlowDataEngine:
|
|
139
|
+
"""The core data handling engine for Flowfile.
|
|
140
|
+
|
|
141
|
+
This class acts as a high-level wrapper around a Polars DataFrame or
|
|
142
|
+
LazyFrame, providing a unified API for data ingestion, transformation,
|
|
143
|
+
and output. It manages data state (lazy vs. eager), schema information,
|
|
144
|
+
and execution logic.
|
|
145
|
+
|
|
146
|
+
Attributes:
|
|
147
|
+
_data_frame: The underlying Polars DataFrame or LazyFrame.
|
|
148
|
+
columns: A list of column names in the current data frame.
|
|
149
|
+
name: An optional name for the data engine instance.
|
|
150
|
+
number_of_records: The number of records. Can be -1 for lazy frames.
|
|
151
|
+
errors: A list of errors encountered during operations.
|
|
152
|
+
_schema: A cached list of `FlowfileColumn` objects representing the schema.
|
|
116
153
|
"""
|
|
117
|
-
A class that provides a unified interface for working with tabular data, supporting both eager and lazy evaluation.
|
|
118
|
-
|
|
119
|
-
The class is organized into several logical sections:
|
|
120
|
-
1. Core properties and initialization
|
|
121
|
-
2. Data access and manipulation
|
|
122
|
-
3. Schema and metadata operations
|
|
123
|
-
4. Transformations and operations
|
|
124
|
-
5. I/O operations
|
|
125
|
-
"""
|
|
126
|
-
|
|
127
154
|
# Core attributes
|
|
128
155
|
_data_frame: Union[pl.DataFrame, pl.LazyFrame]
|
|
129
156
|
columns: List[Any]
|
|
@@ -163,9 +190,6 @@ class FlowDataEngine:
|
|
|
163
190
|
_number_of_records_callback: Callable = None
|
|
164
191
|
_data_callback: Callable = None
|
|
165
192
|
|
|
166
|
-
# Tracking info
|
|
167
|
-
# node_id: int = None # TODO: Implement node_id
|
|
168
|
-
# flow_id: int = None # TODO: Implement flow_id
|
|
169
193
|
|
|
170
194
|
def __init__(self,
|
|
171
195
|
raw_data: Union[List[Dict], List[Any], Dict[str, Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
|
|
@@ -178,7 +202,22 @@ class FlowDataEngine:
|
|
|
178
202
|
streamable: bool = True,
|
|
179
203
|
number_of_records_callback: Callable = None,
|
|
180
204
|
data_callback: Callable = None):
|
|
181
|
-
"""
|
|
205
|
+
"""Initializes the FlowDataEngine from various data sources.
|
|
206
|
+
|
|
207
|
+
Args:
|
|
208
|
+
raw_data: The input data. Can be a list of dicts, a Polars DataFrame/LazyFrame,
|
|
209
|
+
or a `RawData` schema object.
|
|
210
|
+
path_ref: A string path to a Parquet file.
|
|
211
|
+
name: An optional name for the data engine instance.
|
|
212
|
+
optimize_memory: If True, prefers lazy operations to conserve memory.
|
|
213
|
+
schema: An optional schema definition. Can be a list of `FlowfileColumn` objects,
|
|
214
|
+
a list of column names, or a Polars `Schema`.
|
|
215
|
+
number_of_records: The number of records, if known.
|
|
216
|
+
calculate_schema_stats: If True, computes detailed statistics for each column.
|
|
217
|
+
streamable: If True, allows for streaming operations when possible.
|
|
218
|
+
number_of_records_callback: A callback function to retrieve the number of records.
|
|
219
|
+
data_callback: A callback function to retrieve the data.
|
|
220
|
+
"""
|
|
182
221
|
self._initialize_attributes(number_of_records_callback, data_callback, streamable)
|
|
183
222
|
|
|
184
223
|
if raw_data is not None:
|
|
@@ -190,7 +229,11 @@ class FlowDataEngine:
|
|
|
190
229
|
self._finalize_initialization(name, optimize_memory, schema, calculate_schema_stats)
|
|
191
230
|
|
|
192
231
|
def _initialize_attributes(self, number_of_records_callback, data_callback, streamable):
|
|
193
|
-
"""
|
|
232
|
+
"""(Internal) Sets the initial default attributes for a new instance.
|
|
233
|
+
|
|
234
|
+
This helper is called first during initialization to ensure all state-tracking
|
|
235
|
+
and configuration attributes have a clean default value before data is processed.
|
|
236
|
+
"""
|
|
194
237
|
self._external_source = None
|
|
195
238
|
self._number_of_records_callback = number_of_records_callback
|
|
196
239
|
self._data_callback = data_callback
|
|
@@ -204,8 +247,11 @@ class FlowDataEngine:
|
|
|
204
247
|
self.is_future = False
|
|
205
248
|
|
|
206
249
|
def _handle_raw_data(self, raw_data, number_of_records, optimize_memory):
|
|
207
|
-
"""
|
|
250
|
+
"""(Internal) Dispatches raw data to the appropriate handler based on its type.
|
|
208
251
|
|
|
252
|
+
This acts as a router during initialization, inspecting the type of `raw_data`
|
|
253
|
+
and calling the corresponding specialized `_handle_*` method to process it.
|
|
254
|
+
"""
|
|
209
255
|
if isinstance(raw_data, input_schema.RawData):
|
|
210
256
|
self._handle_raw_data_format(raw_data)
|
|
211
257
|
elif isinstance(raw_data, pl.DataFrame):
|
|
@@ -216,12 +262,12 @@ class FlowDataEngine:
|
|
|
216
262
|
self._handle_python_data(raw_data)
|
|
217
263
|
|
|
218
264
|
def _handle_polars_dataframe(self, df: pl.DataFrame, number_of_records: Optional[int]):
|
|
219
|
-
"""
|
|
265
|
+
"""(Internal) Initializes the engine from an eager Polars DataFrame."""
|
|
220
266
|
self.data_frame = df
|
|
221
267
|
self.number_of_records = number_of_records or df.select(pl.len())[0, 0]
|
|
222
268
|
|
|
223
269
|
def _handle_polars_lazy_frame(self, lf: pl.LazyFrame, number_of_records: Optional[int], optimize_memory: bool):
|
|
224
|
-
"""
|
|
270
|
+
"""(Internal) Initializes the engine from a Polars LazyFrame."""
|
|
225
271
|
self.data_frame = lf
|
|
226
272
|
self._lazy = True
|
|
227
273
|
if number_of_records is not None:
|
|
@@ -229,18 +275,17 @@ class FlowDataEngine:
|
|
|
229
275
|
elif optimize_memory:
|
|
230
276
|
self.number_of_records = -1
|
|
231
277
|
else:
|
|
232
|
-
# TODO: assess whether this leads to slow downs with multi remote files
|
|
233
278
|
self.number_of_records = lf.select(pl.len()).collect()[0, 0]
|
|
234
279
|
|
|
235
280
|
def _handle_python_data(self, data: Union[List, Dict]):
|
|
236
|
-
"""
|
|
281
|
+
"""(Internal) Dispatches Python collections to the correct handler."""
|
|
237
282
|
if isinstance(data, dict):
|
|
238
283
|
self._handle_dict_input(data)
|
|
239
284
|
else:
|
|
240
285
|
self._handle_list_input(data)
|
|
241
286
|
|
|
242
287
|
def _handle_dict_input(self, data: Dict):
|
|
243
|
-
"""
|
|
288
|
+
"""(Internal) Initializes the engine from a Python dictionary."""
|
|
244
289
|
if len(data) == 0:
|
|
245
290
|
self.initialize_empty_fl()
|
|
246
291
|
lengths = [len(v) if isinstance(v, (list, tuple)) else 1 for v in data.values()]
|
|
@@ -254,7 +299,14 @@ class FlowDataEngine:
|
|
|
254
299
|
self.lazy = True
|
|
255
300
|
|
|
256
301
|
def _handle_raw_data_format(self, raw_data: input_schema.RawData):
|
|
257
|
-
"""
|
|
302
|
+
"""(Internal) Initializes the engine from a `RawData` schema object.
|
|
303
|
+
|
|
304
|
+
This method uses the schema provided in the `RawData` object to correctly
|
|
305
|
+
infer data types when creating the Polars DataFrame.
|
|
306
|
+
|
|
307
|
+
Args:
|
|
308
|
+
raw_data: An instance of `RawData` containing the data and schema.
|
|
309
|
+
"""
|
|
258
310
|
flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
|
|
259
311
|
polars_schema = pl.Schema([(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
|
|
260
312
|
for flowfile_column in flowfile_schema])
|
|
@@ -268,7 +320,7 @@ class FlowDataEngine:
|
|
|
268
320
|
self.lazy = True
|
|
269
321
|
|
|
270
322
|
def _handle_list_input(self, data: List):
|
|
271
|
-
"""
|
|
323
|
+
"""(Internal) Initializes the engine from a list of records."""
|
|
272
324
|
number_of_records = len(data)
|
|
273
325
|
if number_of_records > 0:
|
|
274
326
|
processed_data = self._process_list_data(data)
|
|
@@ -281,7 +333,11 @@ class FlowDataEngine:
|
|
|
281
333
|
|
|
282
334
|
@staticmethod
|
|
283
335
|
def _process_list_data(data: List) -> List[Dict]:
|
|
284
|
-
"""
|
|
336
|
+
"""(Internal) Normalizes list data into a list of dictionaries.
|
|
337
|
+
|
|
338
|
+
Ensures that a list of objects or non-dict items is converted into a
|
|
339
|
+
uniform list of dictionaries suitable for Polars DataFrame creation.
|
|
340
|
+
"""
|
|
285
341
|
if not (isinstance(data[0], dict) or hasattr(data[0], '__dict__')):
|
|
286
342
|
try:
|
|
287
343
|
return pl.DataFrame(data).to_dicts()
|
|
@@ -296,19 +352,19 @@ class FlowDataEngine:
|
|
|
296
352
|
return ensure_similarity_dicts(data)
|
|
297
353
|
|
|
298
354
|
def to_cloud_storage_obj(self, settings: cloud_storage_schemas.CloudStorageWriteSettingsInternal):
|
|
299
|
-
"""
|
|
300
|
-
Write the FlowDataEngine's data to an object in cloud storage.
|
|
355
|
+
"""Writes the DataFrame to an object in cloud storage.
|
|
301
356
|
|
|
302
|
-
|
|
303
|
-
|
|
357
|
+
This method supports writing to various cloud storage providers like AWS S3,
|
|
358
|
+
Azure Data Lake Storage, and Google Cloud Storage.
|
|
304
359
|
|
|
305
360
|
Args:
|
|
306
|
-
settings:
|
|
361
|
+
settings: A `CloudStorageWriteSettingsInternal` object containing connection
|
|
362
|
+
details, file format, and write options.
|
|
307
363
|
|
|
308
364
|
Raises:
|
|
309
|
-
ValueError: If file format is not supported.
|
|
310
|
-
NotImplementedError: If the 'append' write mode is used.
|
|
311
|
-
Exception: If
|
|
365
|
+
ValueError: If the specified file format is not supported for writing.
|
|
366
|
+
NotImplementedError: If the 'append' write mode is used with an unsupported format.
|
|
367
|
+
Exception: If the write operation to cloud storage fails for any reason.
|
|
312
368
|
"""
|
|
313
369
|
connection = settings.connection
|
|
314
370
|
write_settings = settings.write_settings
|
|
@@ -317,7 +373,6 @@ class FlowDataEngine:
|
|
|
317
373
|
|
|
318
374
|
if write_settings.write_mode == 'append' and write_settings.file_format != "delta":
|
|
319
375
|
raise NotImplementedError("The 'append' write mode is not yet supported for this destination.")
|
|
320
|
-
|
|
321
376
|
storage_options = CloudStorageReader.get_storage_options(connection)
|
|
322
377
|
credential_provider = CloudStorageReader.get_credential_provider(connection)
|
|
323
378
|
# Dispatch to the correct writer based on file format
|
|
@@ -359,7 +414,11 @@ class FlowDataEngine:
|
|
|
359
414
|
storage_options: Dict[str, Any],
|
|
360
415
|
credential_provider: Optional[Callable],
|
|
361
416
|
write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
|
|
362
|
-
"""
|
|
417
|
+
"""(Internal) Writes the DataFrame to a Parquet file in cloud storage.
|
|
418
|
+
|
|
419
|
+
Uses `sink_parquet` for efficient streaming writes. Falls back to a
|
|
420
|
+
collect-then-write pattern if sinking fails.
|
|
421
|
+
"""
|
|
363
422
|
try:
|
|
364
423
|
sink_kwargs = {
|
|
365
424
|
"path": resource_path,
|
|
@@ -371,7 +430,8 @@ class FlowDataEngine:
|
|
|
371
430
|
sink_kwargs["credential_provider"] = credential_provider
|
|
372
431
|
try:
|
|
373
432
|
self.data_frame.sink_parquet(**sink_kwargs)
|
|
374
|
-
except:
|
|
433
|
+
except Exception as e:
|
|
434
|
+
logger.warning(f"Failed to sink the data, falling back to collecing and writing. \n {e}")
|
|
375
435
|
pl_df = self.collect()
|
|
376
436
|
sink_kwargs['file'] = sink_kwargs.pop("path")
|
|
377
437
|
pl_df.write_parquet(**sink_kwargs)
|
|
@@ -385,6 +445,11 @@ class FlowDataEngine:
|
|
|
385
445
|
storage_options: Dict[str, Any],
|
|
386
446
|
credential_provider: Optional[Callable],
|
|
387
447
|
write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
|
|
448
|
+
"""(Internal) Writes the DataFrame to a Delta Lake table in cloud storage.
|
|
449
|
+
|
|
450
|
+
This operation requires collecting the data first, as `write_delta` operates
|
|
451
|
+
on an eager DataFrame.
|
|
452
|
+
"""
|
|
388
453
|
sink_kwargs = {
|
|
389
454
|
"target": resource_path,
|
|
390
455
|
"mode": write_settings.write_mode,
|
|
@@ -400,7 +465,10 @@ class FlowDataEngine:
|
|
|
400
465
|
storage_options: Dict[str, Any],
|
|
401
466
|
credential_provider: Optional[Callable],
|
|
402
467
|
write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
|
|
403
|
-
"""
|
|
468
|
+
"""(Internal) Writes the DataFrame to a CSV file in cloud storage.
|
|
469
|
+
|
|
470
|
+
Uses `sink_csv` for efficient, streaming writes of the data.
|
|
471
|
+
"""
|
|
404
472
|
try:
|
|
405
473
|
sink_kwargs = {
|
|
406
474
|
"path": resource_path,
|
|
@@ -423,7 +491,10 @@ class FlowDataEngine:
|
|
|
423
491
|
storage_options: Dict[str, Any],
|
|
424
492
|
credential_provider: Optional[Callable],
|
|
425
493
|
write_settings: cloud_storage_schemas.CloudStorageWriteSettings):
|
|
426
|
-
"""
|
|
494
|
+
"""(Internal) Writes the DataFrame to a line-delimited JSON (NDJSON) file.
|
|
495
|
+
|
|
496
|
+
Uses `sink_ndjson` for efficient, streaming writes.
|
|
497
|
+
"""
|
|
427
498
|
try:
|
|
428
499
|
sink_kwargs = {"path": resource_path}
|
|
429
500
|
if storage_options:
|
|
@@ -437,22 +508,25 @@ class FlowDataEngine:
|
|
|
437
508
|
raise Exception(f"Failed to write JSON to cloud storage: {str(e)}")
|
|
438
509
|
|
|
439
510
|
@classmethod
|
|
440
|
-
def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal):
|
|
441
|
-
"""
|
|
442
|
-
Create a FlowDataEngine from an object in cloud storage.
|
|
511
|
+
def from_cloud_storage_obj(cls, settings: cloud_storage_schemas.CloudStorageReadSettingsInternal) -> "FlowDataEngine":
|
|
512
|
+
"""Creates a FlowDataEngine from an object in cloud storage.
|
|
443
513
|
|
|
444
|
-
|
|
445
|
-
|
|
514
|
+
This method supports reading from various cloud storage providers like AWS S3,
|
|
515
|
+
Azure Data Lake Storage, and Google Cloud Storage, with support for
|
|
516
|
+
various authentication methods.
|
|
446
517
|
|
|
447
518
|
Args:
|
|
448
|
-
settings:
|
|
519
|
+
settings: A `CloudStorageReadSettingsInternal` object containing connection
|
|
520
|
+
details, file format, and read options.
|
|
449
521
|
|
|
450
522
|
Returns:
|
|
451
|
-
FlowDataEngine
|
|
523
|
+
A new `FlowDataEngine` instance containing the data from cloud storage.
|
|
452
524
|
|
|
453
525
|
Raises:
|
|
454
|
-
ValueError: If storage type or file format is not supported
|
|
455
|
-
|
|
526
|
+
ValueError: If the storage type or file format is not supported.
|
|
527
|
+
NotImplementedError: If a requested file format like "delta" or "iceberg"
|
|
528
|
+
is not yet implemented.
|
|
529
|
+
Exception: If reading from cloud storage fails.
|
|
456
530
|
"""
|
|
457
531
|
connection = settings.connection
|
|
458
532
|
read_settings = settings.read_settings
|
|
@@ -505,11 +579,14 @@ class FlowDataEngine:
|
|
|
505
579
|
raise ValueError(f"Unsupported file format: {read_settings.file_format}")
|
|
506
580
|
|
|
507
581
|
@staticmethod
|
|
508
|
-
def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any]
|
|
582
|
+
def _get_schema_from_first_file_in_dir(source: str, storage_options: Dict[str, Any],
|
|
583
|
+
file_format: Literal["csv", "parquet", "json", "delta"]) -> List[FlowfileColumn] | None:
|
|
584
|
+
"""Infers the schema by scanning the first file in a cloud directory."""
|
|
509
585
|
try:
|
|
586
|
+
scan_func = getattr(pl, "scan_" + file_format)
|
|
510
587
|
first_file_ref = get_first_file_from_s3_dir(source, storage_options=storage_options)
|
|
511
588
|
return convert_stats_to_column_info(FlowDataEngine._create_schema_stats_from_pl_schema(
|
|
512
|
-
|
|
589
|
+
scan_func(first_file_ref, storage_options=storage_options).collect_schema()))
|
|
513
590
|
except Exception as e:
|
|
514
591
|
logger.warning(f"Could not read schema from first file in directory, using default schema: {e}")
|
|
515
592
|
|
|
@@ -520,7 +597,7 @@ class FlowDataEngine:
|
|
|
520
597
|
storage_options: Dict[str, Any],
|
|
521
598
|
credential_provider: Optional[Callable],
|
|
522
599
|
read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
|
|
523
|
-
"""
|
|
600
|
+
"""Reads Iceberg table(s) from cloud storage."""
|
|
524
601
|
raise NotImplementedError(f"Failed to read Iceberg table from cloud storage: Not yet implemented")
|
|
525
602
|
|
|
526
603
|
@classmethod
|
|
@@ -529,7 +606,7 @@ class FlowDataEngine:
|
|
|
529
606
|
storage_options: Dict[str, Any],
|
|
530
607
|
credential_provider: Optional[Callable],
|
|
531
608
|
is_directory: bool) -> "FlowDataEngine":
|
|
532
|
-
"""
|
|
609
|
+
"""Reads Parquet file(s) from cloud storage."""
|
|
533
610
|
try:
|
|
534
611
|
# Use scan_parquet for lazy evaluation
|
|
535
612
|
if is_directory:
|
|
@@ -542,14 +619,14 @@ class FlowDataEngine:
|
|
|
542
619
|
if credential_provider:
|
|
543
620
|
scan_kwargs["credential_provider"] = credential_provider
|
|
544
621
|
if storage_options and is_directory:
|
|
545
|
-
schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
|
|
622
|
+
schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options, "parquet")
|
|
546
623
|
else:
|
|
547
624
|
schema = None
|
|
548
625
|
lf = pl.scan_parquet(**scan_kwargs)
|
|
549
626
|
|
|
550
627
|
return cls(
|
|
551
628
|
lf,
|
|
552
|
-
number_of_records=6_666_666, # Set
|
|
629
|
+
number_of_records=6_666_666, # Set so the provider is not accessed for this stat
|
|
553
630
|
optimize_memory=True,
|
|
554
631
|
streamable=True,
|
|
555
632
|
schema=schema
|
|
@@ -565,6 +642,7 @@ class FlowDataEngine:
|
|
|
565
642
|
storage_options: Dict[str, Any],
|
|
566
643
|
credential_provider: Optional[Callable],
|
|
567
644
|
read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
|
|
645
|
+
"""Reads a Delta Lake table from cloud storage."""
|
|
568
646
|
try:
|
|
569
647
|
logger.info("Reading Delta file from cloud storage...")
|
|
570
648
|
logger.info(f"read_settings: {read_settings}")
|
|
@@ -579,7 +657,7 @@ class FlowDataEngine:
|
|
|
579
657
|
|
|
580
658
|
return cls(
|
|
581
659
|
lf,
|
|
582
|
-
number_of_records=6_666_666, # Set
|
|
660
|
+
number_of_records=6_666_666, # Set so the provider is not accessed for this stat
|
|
583
661
|
optimize_memory=True,
|
|
584
662
|
streamable=True
|
|
585
663
|
)
|
|
@@ -593,7 +671,7 @@ class FlowDataEngine:
|
|
|
593
671
|
storage_options: Dict[str, Any],
|
|
594
672
|
credential_provider: Optional[Callable],
|
|
595
673
|
read_settings: cloud_storage_schemas.CloudStorageReadSettings) -> "FlowDataEngine":
|
|
596
|
-
"""
|
|
674
|
+
"""Reads CSV file(s) from cloud storage."""
|
|
597
675
|
try:
|
|
598
676
|
scan_kwargs = {
|
|
599
677
|
"source": resource_path,
|
|
@@ -610,7 +688,7 @@ class FlowDataEngine:
|
|
|
610
688
|
resource_path = ensure_path_has_wildcard_pattern(resource_path=resource_path, file_format="csv")
|
|
611
689
|
scan_kwargs["source"] = resource_path
|
|
612
690
|
if storage_options and read_settings.scan_mode == "directory":
|
|
613
|
-
schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
|
|
691
|
+
schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options, "csv")
|
|
614
692
|
else:
|
|
615
693
|
schema = None
|
|
616
694
|
|
|
@@ -634,8 +712,10 @@ class FlowDataEngine:
|
|
|
634
712
|
storage_options: Dict[str, Any],
|
|
635
713
|
credential_provider: Optional[Callable],
|
|
636
714
|
is_directory: bool) -> "FlowDataEngine":
|
|
637
|
-
"""
|
|
715
|
+
"""Reads JSON file(s) from cloud storage."""
|
|
638
716
|
try:
|
|
717
|
+
if is_directory:
|
|
718
|
+
resource_path = ensure_path_has_wildcard_pattern(resource_path, "json")
|
|
639
719
|
scan_kwargs = {"source": resource_path}
|
|
640
720
|
|
|
641
721
|
if storage_options:
|
|
@@ -643,13 +723,6 @@ class FlowDataEngine:
|
|
|
643
723
|
if credential_provider:
|
|
644
724
|
scan_kwargs["credential_provider"] = credential_provider
|
|
645
725
|
|
|
646
|
-
if is_directory:
|
|
647
|
-
resource_path = ensure_path_has_wildcard_pattern(resource_path, "json")
|
|
648
|
-
if storage_options and is_directory:
|
|
649
|
-
schema = cls._get_schema_from_first_file_in_dir(resource_path, storage_options)
|
|
650
|
-
else:
|
|
651
|
-
schema = None
|
|
652
|
-
|
|
653
726
|
lf = pl.scan_ndjson(**scan_kwargs) # Using NDJSON for line-delimited JSON
|
|
654
727
|
|
|
655
728
|
return cls(
|
|
@@ -657,7 +730,6 @@ class FlowDataEngine:
|
|
|
657
730
|
number_of_records=-1,
|
|
658
731
|
optimize_memory=True,
|
|
659
732
|
streamable=True,
|
|
660
|
-
schema=schema
|
|
661
733
|
)
|
|
662
734
|
|
|
663
735
|
except Exception as e:
|
|
@@ -665,7 +737,7 @@ class FlowDataEngine:
|
|
|
665
737
|
raise Exception(f"Failed to read JSON from cloud storage: {str(e)}")
|
|
666
738
|
|
|
667
739
|
def _handle_path_ref(self, path_ref: str, optimize_memory: bool):
|
|
668
|
-
"""
|
|
740
|
+
"""Handles file path reference input."""
|
|
669
741
|
try:
|
|
670
742
|
pf = ParquetFile(path_ref)
|
|
671
743
|
except Exception as e:
|
|
@@ -681,7 +753,7 @@ class FlowDataEngine:
|
|
|
681
753
|
|
|
682
754
|
def _finalize_initialization(self, name: str, optimize_memory: bool, schema: Optional[Any],
|
|
683
755
|
calculate_schema_stats: bool):
|
|
684
|
-
"""
|
|
756
|
+
"""Finalizes initialization by setting remaining attributes."""
|
|
685
757
|
_ = calculate_schema_stats
|
|
686
758
|
self.name = name
|
|
687
759
|
self._optimize_memory = optimize_memory
|
|
@@ -694,12 +766,19 @@ class FlowDataEngine:
|
|
|
694
766
|
self.columns = [c.column_name for c in self._schema] if self._schema else pl_schema.names()
|
|
695
767
|
|
|
696
768
|
def __getitem__(self, item):
|
|
697
|
-
"""
|
|
769
|
+
"""Accesses a specific column or item from the DataFrame."""
|
|
698
770
|
return self.data_frame.select([item])
|
|
699
771
|
|
|
700
772
|
@property
|
|
701
773
|
def data_frame(self) -> pl.LazyFrame | pl.DataFrame | None:
|
|
702
|
-
"""
|
|
774
|
+
"""The underlying Polars DataFrame or LazyFrame.
|
|
775
|
+
|
|
776
|
+
This property provides access to the Polars object that backs the
|
|
777
|
+
FlowDataEngine. It handles lazy-loading from external sources if necessary.
|
|
778
|
+
|
|
779
|
+
Returns:
|
|
780
|
+
The active Polars `DataFrame` or `LazyFrame`.
|
|
781
|
+
"""
|
|
703
782
|
if self._data_frame is not None and not self.is_future:
|
|
704
783
|
return self._data_frame
|
|
705
784
|
elif self.is_future:
|
|
@@ -718,24 +797,32 @@ class FlowDataEngine:
|
|
|
718
797
|
|
|
719
798
|
@data_frame.setter
|
|
720
799
|
def data_frame(self, df: pl.LazyFrame | pl.DataFrame):
|
|
721
|
-
"""
|
|
800
|
+
"""Sets the underlying Polars DataFrame or LazyFrame."""
|
|
722
801
|
if self.lazy and isinstance(df, pl.DataFrame):
|
|
723
802
|
raise Exception('Cannot set a non-lazy dataframe to a lazy flowfile')
|
|
724
803
|
self._data_frame = df
|
|
725
804
|
|
|
726
805
|
@staticmethod
|
|
727
806
|
def _create_schema_stats_from_pl_schema(pl_schema: pl.Schema) -> List[Dict]:
|
|
807
|
+
"""Converts a Polars Schema into a list of schema statistics dictionaries."""
|
|
728
808
|
return [
|
|
729
809
|
dict(column_name=k, pl_datatype=v, col_index=i)
|
|
730
810
|
for i, (k, v) in enumerate(pl_schema.items())
|
|
731
811
|
]
|
|
732
812
|
|
|
733
813
|
def _add_schema_from_schema_stats(self, schema_stats: List[Dict]):
|
|
814
|
+
"""Populates the schema from a list of schema statistics dictionaries."""
|
|
734
815
|
self._schema = convert_stats_to_column_info(schema_stats)
|
|
735
816
|
|
|
736
817
|
@property
|
|
737
818
|
def schema(self) -> List[FlowfileColumn]:
|
|
738
|
-
"""
|
|
819
|
+
"""The schema of the DataFrame as a list of `FlowfileColumn` objects.
|
|
820
|
+
|
|
821
|
+
This property lazily calculates the schema if it hasn't been determined yet.
|
|
822
|
+
|
|
823
|
+
Returns:
|
|
824
|
+
A list of `FlowfileColumn` objects describing the schema.
|
|
825
|
+
"""
|
|
739
826
|
if self.number_of_fields == 0:
|
|
740
827
|
return []
|
|
741
828
|
if self._schema is None or (self._calculate_schema_stats and not self.ind_schema_calculated):
|
|
@@ -749,17 +836,28 @@ class FlowDataEngine:
|
|
|
749
836
|
|
|
750
837
|
@property
|
|
751
838
|
def number_of_fields(self) -> int:
|
|
752
|
-
"""
|
|
839
|
+
"""The number of columns (fields) in the DataFrame.
|
|
840
|
+
|
|
841
|
+
Returns:
|
|
842
|
+
The integer count of columns.
|
|
843
|
+
"""
|
|
753
844
|
if self.__number_of_fields is None:
|
|
754
845
|
self.__number_of_fields = len(self.columns)
|
|
755
846
|
return self.__number_of_fields
|
|
756
847
|
|
|
757
|
-
# Data Collection and Sampling Methods
|
|
758
|
-
|
|
759
848
|
def collect(self, n_records: int = None) -> pl.DataFrame:
|
|
760
|
-
"""
|
|
761
|
-
|
|
762
|
-
|
|
849
|
+
"""Collects the data and returns it as a Polars DataFrame.
|
|
850
|
+
|
|
851
|
+
This method triggers the execution of the lazy query plan (if applicable)
|
|
852
|
+
and returns the result. It supports streaming to optimize memory usage
|
|
853
|
+
for large datasets.
|
|
854
|
+
|
|
855
|
+
Args:
|
|
856
|
+
n_records: The maximum number of records to collect. If None, all
|
|
857
|
+
records are collected.
|
|
858
|
+
|
|
859
|
+
Returns:
|
|
860
|
+
A Polars `DataFrame` containing the collected data.
|
|
763
861
|
"""
|
|
764
862
|
if n_records is None:
|
|
765
863
|
logger.info(f'Fetching all data for Table object "{id(self)}". Settings: streaming={self._streamable}')
|
|
@@ -777,7 +875,7 @@ class FlowDataEngine:
|
|
|
777
875
|
return self._handle_collection_error(n_records)
|
|
778
876
|
|
|
779
877
|
def _collect_data(self, n_records: int = None) -> pl.DataFrame:
|
|
780
|
-
"""Internal method to handle data collection."""
|
|
878
|
+
"""Internal method to handle data collection logic."""
|
|
781
879
|
if n_records is None:
|
|
782
880
|
|
|
783
881
|
self.collect_external()
|
|
@@ -799,7 +897,7 @@ class FlowDataEngine:
|
|
|
799
897
|
return self.data_frame.head(n_records).collect()
|
|
800
898
|
|
|
801
899
|
def _collect_from_external_source(self, n_records: int) -> pl.DataFrame:
|
|
802
|
-
"""
|
|
900
|
+
"""Handles collection from an external source."""
|
|
803
901
|
if self.external_source.get_pl_df() is not None:
|
|
804
902
|
all_data = self.external_source.get_pl_df().head(n_records)
|
|
805
903
|
self.data_frame = all_data
|
|
@@ -809,7 +907,7 @@ class FlowDataEngine:
|
|
|
809
907
|
return self.data_frame
|
|
810
908
|
|
|
811
909
|
def _handle_collection_error(self, n_records: int) -> pl.DataFrame:
|
|
812
|
-
"""
|
|
910
|
+
"""Handles errors during collection by attempting partial collection."""
|
|
813
911
|
n_records = 100000000 if n_records is None else n_records
|
|
814
912
|
ok_cols, error_cols = self._identify_valid_columns(n_records)
|
|
815
913
|
|
|
@@ -818,7 +916,7 @@ class FlowDataEngine:
|
|
|
818
916
|
return self._create_empty_dataframe(n_records)
|
|
819
917
|
|
|
820
918
|
def _identify_valid_columns(self, n_records: int) -> Tuple[List[str], List[Tuple[str, Any]]]:
|
|
821
|
-
"""
|
|
919
|
+
"""Identifies which columns can be collected successfully."""
|
|
822
920
|
ok_cols = []
|
|
823
921
|
error_cols = []
|
|
824
922
|
for c in self.columns:
|
|
@@ -831,7 +929,7 @@ class FlowDataEngine:
|
|
|
831
929
|
|
|
832
930
|
def _create_partial_dataframe(self, ok_cols: List[str], error_cols: List[Tuple[str, Any]],
|
|
833
931
|
n_records: int) -> pl.DataFrame:
|
|
834
|
-
"""
|
|
932
|
+
"""Creates a DataFrame with partial data for columns that could be collected."""
|
|
835
933
|
df = self.data_frame.select(ok_cols)
|
|
836
934
|
df = df.with_columns([
|
|
837
935
|
pl.lit(None).alias(column_name).cast(data_type)
|
|
@@ -840,7 +938,7 @@ class FlowDataEngine:
|
|
|
840
938
|
return df.select(self.columns).head(n_records).collect()
|
|
841
939
|
|
|
842
940
|
def _create_empty_dataframe(self, n_records: int) -> pl.DataFrame:
|
|
843
|
-
"""
|
|
941
|
+
"""Creates an empty DataFrame with the correct schema."""
|
|
844
942
|
if self.number_of_records > 0:
|
|
845
943
|
return pl.DataFrame({
|
|
846
944
|
column_name: pl.Series(
|
|
@@ -851,11 +949,19 @@ class FlowDataEngine:
|
|
|
851
949
|
})
|
|
852
950
|
return pl.DataFrame(schema=self.data_frame.schema)
|
|
853
951
|
|
|
854
|
-
# Data Transformation Methods
|
|
855
|
-
|
|
856
952
|
def do_group_by(self, group_by_input: transform_schemas.GroupByInput,
|
|
857
953
|
calculate_schema_stats: bool = True) -> "FlowDataEngine":
|
|
858
|
-
"""
|
|
954
|
+
"""Performs a group-by operation on the DataFrame.
|
|
955
|
+
|
|
956
|
+
Args:
|
|
957
|
+
group_by_input: A `GroupByInput` object defining the grouping columns
|
|
958
|
+
and aggregations.
|
|
959
|
+
calculate_schema_stats: If True, calculates schema statistics for the
|
|
960
|
+
resulting DataFrame.
|
|
961
|
+
|
|
962
|
+
Returns:
|
|
963
|
+
A new `FlowDataEngine` instance with the grouped and aggregated data.
|
|
964
|
+
"""
|
|
859
965
|
aggregations = [c for c in group_by_input.agg_cols if c.agg != 'groupby']
|
|
860
966
|
group_columns = [c for c in group_by_input.agg_cols if c.agg == 'groupby']
|
|
861
967
|
|
|
@@ -877,7 +983,15 @@ class FlowDataEngine:
|
|
|
877
983
|
)
|
|
878
984
|
|
|
879
985
|
def do_sort(self, sorts: List[transform_schemas.SortByInput]) -> "FlowDataEngine":
|
|
880
|
-
"""
|
|
986
|
+
"""Sorts the DataFrame by one or more columns.
|
|
987
|
+
|
|
988
|
+
Args:
|
|
989
|
+
sorts: A list of `SortByInput` objects, each specifying a column
|
|
990
|
+
and sort direction ('asc' or 'desc').
|
|
991
|
+
|
|
992
|
+
Returns:
|
|
993
|
+
A new `FlowDataEngine` instance with the sorted data.
|
|
994
|
+
"""
|
|
881
995
|
if not sorts:
|
|
882
996
|
return self
|
|
883
997
|
|
|
@@ -887,7 +1001,16 @@ class FlowDataEngine:
|
|
|
887
1001
|
|
|
888
1002
|
def change_column_types(self, transforms: List[transform_schemas.SelectInput],
|
|
889
1003
|
calculate_schema: bool = False) -> "FlowDataEngine":
|
|
890
|
-
"""
|
|
1004
|
+
"""Changes the data type of one or more columns.
|
|
1005
|
+
|
|
1006
|
+
Args:
|
|
1007
|
+
transforms: A list of `SelectInput` objects, where each object specifies
|
|
1008
|
+
the column and its new `polars_type`.
|
|
1009
|
+
calculate_schema: If True, recalculates the schema after the type change.
|
|
1010
|
+
|
|
1011
|
+
Returns:
|
|
1012
|
+
A new `FlowDataEngine` instance with the updated column types.
|
|
1013
|
+
"""
|
|
891
1014
|
dtypes = [dtype.base_type() for dtype in self.data_frame.collect_schema().dtypes()]
|
|
892
1015
|
idx_mapping = list(
|
|
893
1016
|
(transform.old_name, self.cols_idx.get(transform.old_name), getattr(pl, transform.polars_type))
|
|
@@ -908,27 +1031,63 @@ class FlowDataEngine:
|
|
|
908
1031
|
streamable=self._streamable
|
|
909
1032
|
)
|
|
910
1033
|
|
|
911
|
-
# Data Export and Conversion Methods
|
|
912
|
-
|
|
913
1034
|
def save(self, path: str, data_type: str = 'parquet') -> Future:
|
|
914
|
-
"""
|
|
1035
|
+
"""Saves the DataFrame to a file in a separate thread.
|
|
1036
|
+
|
|
1037
|
+
Args:
|
|
1038
|
+
path: The file path to save to.
|
|
1039
|
+
data_type: The format to save in (e.g., 'parquet', 'csv').
|
|
1040
|
+
|
|
1041
|
+
Returns:
|
|
1042
|
+
A `loky.Future` object representing the asynchronous save operation.
|
|
1043
|
+
"""
|
|
915
1044
|
estimated_size = deepcopy(self.get_estimated_file_size() * 4)
|
|
916
1045
|
df = deepcopy(self.data_frame)
|
|
917
1046
|
return write_threaded(_df=df, path=path, data_type=data_type, estimated_size=estimated_size)
|
|
918
1047
|
|
|
919
1048
|
def to_pylist(self) -> List[Dict]:
|
|
920
|
-
"""
|
|
1049
|
+
"""Converts the DataFrame to a list of Python dictionaries.
|
|
1050
|
+
|
|
1051
|
+
Returns:
|
|
1052
|
+
A list where each item is a dictionary representing a row.
|
|
1053
|
+
"""
|
|
921
1054
|
if self.lazy:
|
|
922
1055
|
return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
|
|
923
1056
|
return self.data_frame.to_dicts()
|
|
924
1057
|
|
|
1058
|
+
def to_arrow(self) -> PaTable:
|
|
1059
|
+
"""Converts the DataFrame to a PyArrow Table.
|
|
1060
|
+
|
|
1061
|
+
This method triggers a `.collect()` call if the data is lazy,
|
|
1062
|
+
then converts the resulting eager DataFrame into a `pyarrow.Table`.
|
|
1063
|
+
|
|
1064
|
+
Returns:
|
|
1065
|
+
A `pyarrow.Table` instance representing the data.
|
|
1066
|
+
"""
|
|
1067
|
+
if self.lazy:
|
|
1068
|
+
return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_arrow()
|
|
1069
|
+
else:
|
|
1070
|
+
return self.data_frame.to_arrow()
|
|
1071
|
+
|
|
925
1072
|
def to_raw_data(self) -> input_schema.RawData:
|
|
926
|
-
"""
|
|
1073
|
+
"""Converts the DataFrame to a `RawData` schema object.
|
|
1074
|
+
|
|
1075
|
+
Returns:
|
|
1076
|
+
An `input_schema.RawData` object containing the schema and data.
|
|
1077
|
+
"""
|
|
927
1078
|
columns = [c.get_minimal_field_info() for c in self.schema]
|
|
928
1079
|
data = list(self.to_dict().values())
|
|
929
1080
|
return input_schema.RawData(columns=columns, data=data)
|
|
930
1081
|
|
|
931
1082
|
def to_dict(self) -> Dict[str, List]:
|
|
1083
|
+
"""Converts the DataFrame to a Python dictionary of columns.
|
|
1084
|
+
|
|
1085
|
+
Each key in the dictionary is a column name, and the corresponding value
|
|
1086
|
+
is a list of the data in that column.
|
|
1087
|
+
|
|
1088
|
+
Returns:
|
|
1089
|
+
A dictionary mapping column names to lists of their values.
|
|
1090
|
+
"""
|
|
932
1091
|
if self.lazy:
|
|
933
1092
|
return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
|
|
934
1093
|
else:
|
|
@@ -936,7 +1095,15 @@ class FlowDataEngine:
|
|
|
936
1095
|
|
|
937
1096
|
@classmethod
|
|
938
1097
|
def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
|
|
939
|
-
"""
|
|
1098
|
+
"""Creates a FlowDataEngine from an external data source.
|
|
1099
|
+
|
|
1100
|
+
Args:
|
|
1101
|
+
external_source: An object that conforms to the `ExternalDataSource`
|
|
1102
|
+
interface.
|
|
1103
|
+
|
|
1104
|
+
Returns:
|
|
1105
|
+
A new `FlowDataEngine` instance.
|
|
1106
|
+
"""
|
|
940
1107
|
if external_source.schema is not None:
|
|
941
1108
|
ff = cls.create_from_schema(external_source.schema)
|
|
942
1109
|
elif external_source.initial_data_getter is not None:
|
|
@@ -948,12 +1115,27 @@ class FlowDataEngine:
|
|
|
948
1115
|
|
|
949
1116
|
@classmethod
|
|
950
1117
|
def create_from_sql(cls, sql: str, conn: Any) -> "FlowDataEngine":
|
|
951
|
-
"""
|
|
1118
|
+
"""Creates a FlowDataEngine by executing a SQL query.
|
|
1119
|
+
|
|
1120
|
+
Args:
|
|
1121
|
+
sql: The SQL query string to execute.
|
|
1122
|
+
conn: A database connection object or connection URI string.
|
|
1123
|
+
|
|
1124
|
+
Returns:
|
|
1125
|
+
A new `FlowDataEngine` instance with the query result.
|
|
1126
|
+
"""
|
|
952
1127
|
return cls(pl.read_sql(sql, conn))
|
|
953
1128
|
|
|
954
1129
|
@classmethod
|
|
955
1130
|
def create_from_schema(cls, schema: List[FlowfileColumn]) -> "FlowDataEngine":
|
|
956
|
-
"""
|
|
1131
|
+
"""Creates an empty FlowDataEngine from a schema definition.
|
|
1132
|
+
|
|
1133
|
+
Args:
|
|
1134
|
+
schema: A list of `FlowfileColumn` objects defining the schema.
|
|
1135
|
+
|
|
1136
|
+
Returns:
|
|
1137
|
+
A new, empty `FlowDataEngine` instance with the specified schema.
|
|
1138
|
+
"""
|
|
957
1139
|
pl_schema = []
|
|
958
1140
|
for i, flow_file_column in enumerate(schema):
|
|
959
1141
|
pl_schema.append((flow_file_column.name, cast_str_to_polars_type(flow_file_column.data_type)))
|
|
@@ -963,7 +1145,17 @@ class FlowDataEngine:
|
|
|
963
1145
|
|
|
964
1146
|
@classmethod
|
|
965
1147
|
def create_from_path(cls, received_table: input_schema.ReceivedTableBase) -> "FlowDataEngine":
|
|
966
|
-
"""
|
|
1148
|
+
"""Creates a FlowDataEngine from a local file path.
|
|
1149
|
+
|
|
1150
|
+
Supports various file types like CSV, Parquet, and Excel.
|
|
1151
|
+
|
|
1152
|
+
Args:
|
|
1153
|
+
received_table: A `ReceivedTableBase` object containing the file path
|
|
1154
|
+
and format details.
|
|
1155
|
+
|
|
1156
|
+
Returns:
|
|
1157
|
+
A new `FlowDataEngine` instance with data from the file.
|
|
1158
|
+
"""
|
|
967
1159
|
received_table.set_absolute_filepath()
|
|
968
1160
|
file_type_handlers = {
|
|
969
1161
|
'csv': create_funcs.create_from_path_csv,
|
|
@@ -981,19 +1173,36 @@ class FlowDataEngine:
|
|
|
981
1173
|
|
|
982
1174
|
@classmethod
|
|
983
1175
|
def create_random(cls, number_of_records: int = 1000) -> "FlowDataEngine":
|
|
984
|
-
"""
|
|
1176
|
+
"""Creates a FlowDataEngine with randomly generated data.
|
|
1177
|
+
|
|
1178
|
+
Useful for testing and examples.
|
|
1179
|
+
|
|
1180
|
+
Args:
|
|
1181
|
+
number_of_records: The number of random records to generate.
|
|
1182
|
+
|
|
1183
|
+
Returns:
|
|
1184
|
+
A new `FlowDataEngine` instance with fake data.
|
|
1185
|
+
"""
|
|
985
1186
|
return cls(create_fake_data(number_of_records))
|
|
986
1187
|
|
|
987
1188
|
@classmethod
|
|
988
1189
|
def generate_enumerator(cls, length: int = 1000, output_name: str = 'output_column') -> "FlowDataEngine":
|
|
989
|
-
"""
|
|
1190
|
+
"""Generates a FlowDataEngine with a single column containing a sequence of integers.
|
|
1191
|
+
|
|
1192
|
+
Args:
|
|
1193
|
+
length: The number of integers to generate in the sequence.
|
|
1194
|
+
output_name: The name of the output column.
|
|
1195
|
+
|
|
1196
|
+
Returns:
|
|
1197
|
+
A new `FlowDataEngine` instance.
|
|
1198
|
+
"""
|
|
990
1199
|
if length > 10_000_000:
|
|
991
1200
|
length = 10_000_000
|
|
992
1201
|
return cls(pl.LazyFrame().select((pl.int_range(0, length, dtype=pl.UInt32)).alias(output_name)))
|
|
993
1202
|
|
|
994
1203
|
def _handle_schema(self, schema: List[FlowfileColumn] | List[str] | pl.Schema | None,
|
|
995
1204
|
pl_schema: pl.Schema) -> List[FlowfileColumn] | None:
|
|
996
|
-
"""
|
|
1205
|
+
"""Handles schema processing and validation during initialization."""
|
|
997
1206
|
if schema is None and pl_schema is not None:
|
|
998
1207
|
return convert_stats_to_column_info(self._create_schema_stats_from_pl_schema(pl_schema))
|
|
999
1208
|
elif schema is None and pl_schema is None:
|
|
@@ -1013,7 +1222,7 @@ class FlowDataEngine:
|
|
|
1013
1222
|
return schema
|
|
1014
1223
|
|
|
1015
1224
|
def _handle_polars_schema(self, schema: pl.Schema, pl_schema: pl.Schema) -> List[FlowfileColumn]:
|
|
1016
|
-
"""
|
|
1225
|
+
"""Handles Polars schema conversion."""
|
|
1017
1226
|
flow_file_columns = [
|
|
1018
1227
|
FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
|
|
1019
1228
|
for col_name, dtype in zip(schema.names(), schema.dtypes())
|
|
@@ -1028,7 +1237,7 @@ class FlowDataEngine:
|
|
|
1028
1237
|
return flow_file_columns
|
|
1029
1238
|
|
|
1030
1239
|
def _handle_string_schema(self, schema: List[str], pl_schema: pl.Schema) -> List[FlowfileColumn]:
|
|
1031
|
-
"""
|
|
1240
|
+
"""Handles string-based schema conversion."""
|
|
1032
1241
|
flow_file_columns = [
|
|
1033
1242
|
FlowfileColumn.create_from_polars_dtype(column_name=col_name, data_type=dtype)
|
|
1034
1243
|
for col_name, dtype in zip(schema, pl_schema.dtypes())
|
|
@@ -1040,10 +1249,19 @@ class FlowDataEngine:
|
|
|
1040
1249
|
|
|
1041
1250
|
return flow_file_columns
|
|
1042
1251
|
|
|
1043
|
-
# Data Manipulation Methods
|
|
1044
|
-
|
|
1045
1252
|
def split(self, split_input: transform_schemas.TextToRowsInput) -> "FlowDataEngine":
|
|
1046
|
-
"""
|
|
1253
|
+
"""Splits a column's text values into multiple rows based on a delimiter.
|
|
1254
|
+
|
|
1255
|
+
This operation is often referred to as "exploding" the DataFrame, as it
|
|
1256
|
+
increases the number of rows.
|
|
1257
|
+
|
|
1258
|
+
Args:
|
|
1259
|
+
split_input: A `TextToRowsInput` object specifying the column to split,
|
|
1260
|
+
the delimiter, and the output column name.
|
|
1261
|
+
|
|
1262
|
+
Returns:
|
|
1263
|
+
A new `FlowDataEngine` instance with the exploded rows.
|
|
1264
|
+
"""
|
|
1047
1265
|
output_column_name = (
|
|
1048
1266
|
split_input.output_column_name
|
|
1049
1267
|
if split_input.output_column_name
|
|
@@ -1068,7 +1286,18 @@ class FlowDataEngine:
|
|
|
1068
1286
|
return FlowDataEngine(df)
|
|
1069
1287
|
|
|
1070
1288
|
def unpivot(self, unpivot_input: transform_schemas.UnpivotInput) -> "FlowDataEngine":
|
|
1071
|
-
"""
|
|
1289
|
+
"""Converts the DataFrame from a wide to a long format.
|
|
1290
|
+
|
|
1291
|
+
This is the inverse of a pivot operation, taking columns and transforming
|
|
1292
|
+
them into `variable` and `value` rows.
|
|
1293
|
+
|
|
1294
|
+
Args:
|
|
1295
|
+
unpivot_input: An `UnpivotInput` object specifying which columns to
|
|
1296
|
+
unpivot and which to keep as index columns.
|
|
1297
|
+
|
|
1298
|
+
Returns:
|
|
1299
|
+
A new, unpivoted `FlowDataEngine` instance.
|
|
1300
|
+
"""
|
|
1072
1301
|
lf = self.data_frame
|
|
1073
1302
|
|
|
1074
1303
|
if unpivot_input.data_type_selector_expr is not None:
|
|
@@ -1087,7 +1316,17 @@ class FlowDataEngine:
|
|
|
1087
1316
|
return FlowDataEngine(result)
|
|
1088
1317
|
|
|
1089
1318
|
def do_pivot(self, pivot_input: transform_schemas.PivotInput, node_logger: NodeLogger = None) -> "FlowDataEngine":
|
|
1090
|
-
"""
|
|
1319
|
+
"""Converts the DataFrame from a long to a wide format, aggregating values.
|
|
1320
|
+
|
|
1321
|
+
Args:
|
|
1322
|
+
pivot_input: A `PivotInput` object defining the index, pivot, and value
|
|
1323
|
+
columns, along with the aggregation logic.
|
|
1324
|
+
node_logger: An optional logger for reporting warnings, e.g., if the
|
|
1325
|
+
pivot column has too many unique values.
|
|
1326
|
+
|
|
1327
|
+
Returns:
|
|
1328
|
+
A new, pivoted `FlowDataEngine` instance.
|
|
1329
|
+
"""
|
|
1091
1330
|
# Get unique values for pivot columns
|
|
1092
1331
|
max_unique_vals = 200
|
|
1093
1332
|
new_cols_unique = fetch_unique_values(self.data_frame.select(pivot_input.pivot_column)
|
|
@@ -1147,7 +1386,16 @@ class FlowDataEngine:
|
|
|
1147
1386
|
return FlowDataEngine(df, calculate_schema_stats=False)
|
|
1148
1387
|
|
|
1149
1388
|
def do_filter(self, predicate: str) -> "FlowDataEngine":
|
|
1150
|
-
"""
|
|
1389
|
+
"""Filters rows based on a predicate expression.
|
|
1390
|
+
|
|
1391
|
+
Args:
|
|
1392
|
+
predicate: A string containing a Polars expression that evaluates to
|
|
1393
|
+
a boolean value.
|
|
1394
|
+
|
|
1395
|
+
Returns:
|
|
1396
|
+
A new `FlowDataEngine` instance containing only the rows that match
|
|
1397
|
+
the predicate.
|
|
1398
|
+
"""
|
|
1151
1399
|
try:
|
|
1152
1400
|
f = to_expr(predicate)
|
|
1153
1401
|
except Exception as e:
|
|
@@ -1157,13 +1405,24 @@ class FlowDataEngine:
|
|
|
1157
1405
|
return FlowDataEngine(df, schema=self.schema, streamable=self._streamable)
|
|
1158
1406
|
|
|
1159
1407
|
def add_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
|
|
1160
|
-
"""
|
|
1408
|
+
"""Adds a record ID (row number) column to the DataFrame.
|
|
1409
|
+
|
|
1410
|
+
Can generate a simple sequential ID or a grouped ID that resets for
|
|
1411
|
+
each group.
|
|
1412
|
+
|
|
1413
|
+
Args:
|
|
1414
|
+
record_id_settings: A `RecordIdInput` object specifying the output
|
|
1415
|
+
column name, offset, and optional grouping columns.
|
|
1416
|
+
|
|
1417
|
+
Returns:
|
|
1418
|
+
A new `FlowDataEngine` instance with the added record ID column.
|
|
1419
|
+
"""
|
|
1161
1420
|
if record_id_settings.group_by and len(record_id_settings.group_by_columns) > 0:
|
|
1162
1421
|
return self._add_grouped_record_id(record_id_settings)
|
|
1163
1422
|
return self._add_simple_record_id(record_id_settings)
|
|
1164
1423
|
|
|
1165
1424
|
def _add_grouped_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
|
|
1166
|
-
"""
|
|
1425
|
+
"""Adds a record ID column with grouping."""
|
|
1167
1426
|
select_cols = [pl.col(record_id_settings.output_column_name)] + [pl.col(c) for c in self.columns]
|
|
1168
1427
|
|
|
1169
1428
|
df = (
|
|
@@ -1183,7 +1442,7 @@ class FlowDataEngine:
|
|
|
1183
1442
|
return FlowDataEngine(df, schema=output_schema)
|
|
1184
1443
|
|
|
1185
1444
|
def _add_simple_record_id(self, record_id_settings: transform_schemas.RecordIdInput) -> "FlowDataEngine":
|
|
1186
|
-
"""
|
|
1445
|
+
"""Adds a simple sequential record ID column."""
|
|
1187
1446
|
df = self.data_frame.with_row_index(
|
|
1188
1447
|
record_id_settings.output_column_name,
|
|
1189
1448
|
record_id_settings.offset
|
|
@@ -1194,38 +1453,52 @@ class FlowDataEngine:

         return FlowDataEngine(df, schema=output_schema)

-    # Utility Methods
-
     def get_schema_column(self, col_name: str) -> FlowfileColumn:
-        """
+        """Retrieves the schema information for a single column by its name.
+
+        Args:
+            col_name: The name of the column to retrieve.
+
+        Returns:
+            A `FlowfileColumn` object for the specified column, or `None` if not found.
+        """
         for s in self.schema:
             if s.name == col_name:
                 return s

     def get_estimated_file_size(self) -> int:
-        """
+        """Estimates the file size in bytes if the data originated from a local file.
+
+        This relies on the original path being tracked during file ingestion.
+
+        Returns:
+            The file size in bytes, or 0 if the original path is unknown.
+        """
         if self._org_path is not None:
             return os.path.getsize(self._org_path)
         return 0

     def __repr__(self) -> str:
-        """
-        return f'
+        """Returns a string representation of the FlowDataEngine."""
+        return f'flow data engine\n{self.data_frame.__repr__()}'

     def __call__(self) -> "FlowDataEngine":
-        """
+        """Makes the class instance callable, returning itself."""
         return self

     def __len__(self) -> int:
-        """
+        """Returns the number of records in the table."""
         return self.number_of_records if self.number_of_records >= 0 else self.get_number_of_records()

     def cache(self) -> "FlowDataEngine":
-        """
-
+        """Caches the current DataFrame to disk and updates the internal reference.
+
+        This triggers a background process to write the current LazyFrame's result
+        to a temporary file. Subsequent operations on this `FlowDataEngine` instance
+        will read from the cached file, which can speed up downstream computations.

         Returns:
-            FlowDataEngine
+            The same `FlowDataEngine` instance, now backed by the cached data.
         """
         edf = ExternalDfFetcher(lf=self.data_frame, file_ref=str(id(self)), wait_on_completion=False,
                                 flow_id=-1,
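The `cache()` docstring above describes materializing the LazyFrame to a temporary file and reading subsequent operations from that file. A conceptual plain-Polars sketch of that pattern follows; the real implementation goes through `ExternalDfFetcher` in a background worker, so the helper and file path below are illustrative only:

```python
import tempfile
from pathlib import Path

import polars as pl


def cache_to_disk(lf: pl.LazyFrame) -> pl.LazyFrame:
    # Illustrative only: Flowfile's cache() offloads this write to a worker
    # process via ExternalDfFetcher instead of writing inline like this.
    tmp_path = Path(tempfile.mkdtemp()) / "cached.parquet"
    lf.collect().write_parquet(tmp_path)   # materialize the lazy result once
    return pl.scan_parquet(tmp_path)       # downstream reads hit the cached file


cached = cache_to_disk(pl.LazyFrame({"x": [1, 2, 3]}))
print(cached.select(pl.col("x").sum()).collect())
```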
@@ -1240,7 +1513,13 @@ class FlowDataEngine:
         return self

     def collect_external(self):
-        """
+        """Materializes data from a tracked external source.
+
+        If the `FlowDataEngine` was created from an `ExternalDataSource`, this
+        method will trigger the data retrieval, update the internal `_data_frame`
+        to a `LazyFrame` of the collected data, and reset the schema to be
+        re-evaluated.
+        """
         if self._external_source is not None:
             logger.info('Collecting external source')
             if self.external_source.get_pl_df() is not None:
@@ -1249,16 +1528,16 @@ class FlowDataEngine:
                 self.data_frame = pl.LazyFrame(list(self.external_source.get_iter()))
             self._schema = None  # enforce reset schema

-    # Data Access Methods
     def get_output_sample(self, n_rows: int = 10) -> List[Dict]:
-        """
-
+        """Gets a sample of the data as a list of dictionaries.
+
+        This is typically used to display a preview of the data in a UI.

         Args:
-            n_rows:
+            n_rows: The number of rows to sample.

         Returns:
-
+            A list of dictionaries, where each dictionary represents a row.
         """
         if self.number_of_records > n_rows or self.number_of_records < 0:
             df = self.collect(n_rows)
@@ -1267,6 +1546,7 @@ class FlowDataEngine:
         return df.to_dicts()

     def __get_sample__(self, n_rows: int = 100, streamable: bool = True) -> "FlowDataEngine":
+        """Internal method to get a sample of the data."""
         if not self.lazy:
             df = self.data_frame.lazy()
         else:
@@ -1284,20 +1564,20 @@ class FlowDataEngine:

     def get_sample(self, n_rows: int = 100, random: bool = False, shuffle: bool = False,
                    seed: int = None) -> "FlowDataEngine":
-        """
-        Get a sample of rows from the DataFrame.
+        """Gets a sample of rows from the DataFrame.

         Args:
-            n_rows:
-            random:
-            shuffle:
-            seed:
+            n_rows: The number of rows to sample.
+            random: If True, performs random sampling. If False, takes the first n_rows.
+            shuffle: If True (and `random` is True), shuffles the data before sampling.
+            seed: A random seed for reproducibility.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance containing the sampled data.
         """
-        n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=
+        n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=OFFLOAD_TO_WORKER))
         logging.info(f'Getting sample of {n_rows} rows')
+
         if random:
             if self.lazy and self.external_source is not None:
                 self.collect_external()
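A usage sketch of the sampling API documented above. Only the `get_sample` and `get_subset` signatures and defaults are taken from the diff; the input data is made up, and `FlowDataEngine` is assumed to be importable from this module and constructible from a Polars frame, as the surrounding code does:

```python
import polars as pl

# Hypothetical input; the diff shows FlowDataEngine being constructed
# directly from Polars frames, which is what is assumed here.
engine = FlowDataEngine(pl.LazyFrame({"value": list(range(1_000))}))

first_100 = engine.get_sample(n_rows=100)                       # first n rows
random_50 = engine.get_sample(n_rows=50, random=True, seed=42)  # reproducible random sample
subset_10 = engine.get_subset(n_rows=10)                        # plain head()
```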
@@ -1319,31 +1599,30 @@ class FlowDataEngine:
         return FlowDataEngine(sample_df, schema=self.schema, number_of_records=n_records)

     def get_subset(self, n_rows: int = 100) -> "FlowDataEngine":
-        """
-        Get a subset of rows from the DataFrame.
+        """Gets the first `n_rows` from the DataFrame.

         Args:
-            n_rows:
+            n_rows: The number of rows to include in the subset.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance containing the subset of data.
         """
         if not self.lazy:
             return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)
         else:
             return FlowDataEngine(self.data_frame.head(n_rows), calculate_schema_stats=True)

-
-
-        """
-        Iterate over the DataFrame in batches.
+    def iter_batches(self, batch_size: int = 1000,
+                     columns: Union[List, Tuple, str] = None) -> Generator["FlowDataEngine", None, None]:
+        """Iterates over the DataFrame in batches.

         Args:
-            batch_size:
-            columns:
+            batch_size: The size of each batch.
+            columns: A list of column names to include in the batches. If None,
+                all columns are included.

         Yields:
-            FlowDataEngine
+            A `FlowDataEngine` instance for each batch.
         """
         if columns:
             self.data_frame = self.data_frame.select(columns)
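A sketch of batched iteration as documented above; the batch size and column list are arbitrary example values, and `FlowDataEngine` is assumed to be in scope:

```python
import polars as pl

engine = FlowDataEngine(pl.LazyFrame({"value": list(range(1_000))}))  # hypothetical input

# Iterate in chunks of 250 rows, restricted to a single column; each batch is
# itself a FlowDataEngine, so __len__ reports its record count.
total_rows = 0
for batch in engine.iter_batches(batch_size=250, columns=["value"]):
    total_rows += len(batch)
print(total_rows)
```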
@@ -1355,17 +1634,21 @@ class FlowDataEngine:
     def start_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
                          other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
                          node_id: int | str = -1) -> ExternalFuzzyMatchFetcher:
-        """
-
+        """Starts a fuzzy join operation in a background process.
+
+        This method prepares the data and initiates the fuzzy matching in a
+        separate process, returning a tracker object immediately.

         Args:
-            fuzzy_match_input:
-            other:
-            file_ref:
-            flow_id:
-            node_id:
+            fuzzy_match_input: A `FuzzyMatchInput` object with the matching parameters.
+            other: The right `FlowDataEngine` to join with.
+            file_ref: A reference string for temporary files.
+            flow_id: The flow ID for tracking.
+            node_id: The node ID for tracking.
+
         Returns:
-
+            An `ExternalFuzzyMatchFetcher` object that can be used to track the
+            progress and retrieve the result of the fuzzy join.
         """
         left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
                                                     fuzzy_match_input=fuzzy_match_input)
@@ -1379,17 +1662,19 @@ class FlowDataEngine:
     def do_fuzzy_join(self, fuzzy_match_input: transform_schemas.FuzzyMatchInput,
                       other: "FlowDataEngine", file_ref: str, flow_id: int = -1,
                       node_id: int | str = -1) -> "FlowDataEngine":
-        """
-
+        """Performs a fuzzy join with another DataFrame.
+
+        This method blocks until the fuzzy join operation is complete.

         Args:
-            fuzzy_match_input:
-            other:
-            file_ref:
-            flow_id:
-            node_id:
+            fuzzy_match_input: A `FuzzyMatchInput` object with the matching parameters.
+            other: The right `FlowDataEngine` to join with.
+            file_ref: A reference string for temporary files.
+            flow_id: The flow ID for tracking.
+            node_id: The node ID for tracking.
+
         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the result of the fuzzy join.
         """
         left_df, right_df = prepare_for_fuzzy_match(left=self, right=other,
                                                     fuzzy_match_input=fuzzy_match_input)
@@ -1403,18 +1688,19 @@ class FlowDataEngine:

     def fuzzy_match(self, right: "FlowDataEngine", left_on: str, right_on: str,
                     fuzzy_method: str = 'levenshtein', threshold: float = 0.75) -> "FlowDataEngine":
-        """
-
+        """Performs a simple fuzzy match between two DataFrames on a single column pair.
+
+        This is a convenience method for a common fuzzy join scenario.

         Args:
-            right:
-            left_on:
-            right_on:
-            fuzzy_method:
-            threshold:
+            right: The right `FlowDataEngine` to match against.
+            left_on: The column name from the left DataFrame to match on.
+            right_on: The column name from the right DataFrame to match on.
+            fuzzy_method: The fuzzy matching algorithm to use (e.g., 'levenshtein').
+            threshold: The similarity score threshold (0.0 to 1.0) for a match.

         Returns:
-
+            A new `FlowDataEngine` with the matched data.
         """
         fuzzy_match_input = transform_schemas.FuzzyMatchInput(
             [transform_schemas.FuzzyMap(
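An illustrative call to the convenience `fuzzy_match` wrapper documented above. The frames and column names are hypothetical; the keyword arguments mirror the signature in the diff, and `FlowDataEngine` is assumed to be in scope:

```python
import polars as pl

customers = FlowDataEngine(pl.LazyFrame({"name": ["Acme Corp", "Globex"]}))
invoices = FlowDataEngine(pl.LazyFrame({"billed_to": ["ACME Corporation", "Globex LLC"]}))

# Single-column fuzzy match; method and threshold mirror the defaults above.
matched = customers.fuzzy_match(
    right=invoices,
    left_on="name",
    right_on="billed_to",
    fuzzy_method="levenshtein",
    threshold=0.75,
)
```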
@@ -1430,29 +1716,28 @@ class FlowDataEngine:
     def do_cross_join(self, cross_join_input: transform_schemas.CrossJoinInput,
                       auto_generate_selection: bool, verify_integrity: bool,
                       other: "FlowDataEngine") -> "FlowDataEngine":
-        """
-
+        """Performs a cross join with another DataFrame.
+
+        A cross join produces the Cartesian product of the two DataFrames.

         Args:
-            cross_join_input:
-            auto_generate_selection:
-            verify_integrity:
-            other:
+            cross_join_input: A `CrossJoinInput` object specifying column selections.
+            auto_generate_selection: If True, automatically renames columns to avoid conflicts.
+            verify_integrity: If True, checks if the resulting join would be too large.
+            other: The right `FlowDataEngine` to join with.

         Returns:
-
+            A new `FlowDataEngine` with the result of the cross join.

         Raises:
-            Exception: If join would result in
+            Exception: If `verify_integrity` is True and the join would result in
+                an excessively large number of records.
         """
         self.lazy = True
         other.lazy = True

         verify_join_select_integrity(cross_join_input, left_columns=self.columns, right_columns=other.columns)

-        # if auto_generate_selection:
-        #     cross_join_input.auto_rename()
-
         right_select = [v.old_name for v in cross_join_input.right_select.renames
                         if (v.keep or v.join_key) and v.is_available]
         left_select = [v.old_name for v in cross_join_input.left_select.renames
@@ -1484,31 +1769,32 @@ class FlowDataEngine:

     def join(self, join_input: transform_schemas.JoinInput, auto_generate_selection: bool,
              verify_integrity: bool, other: "FlowDataEngine") -> "FlowDataEngine":
-        """
-
+        """Performs a standard SQL-style join with another DataFrame.
+
+        Supports various join types like 'inner', 'left', 'right', 'outer', 'semi', and 'anti'.

         Args:
-            join_input:
-
-
-
+            join_input: A `JoinInput` object defining the join keys, join type,
+                and column selections.
+            auto_generate_selection: If True, automatically handles column renaming.
+            verify_integrity: If True, performs checks to prevent excessively large joins.
+            other: The right `FlowDataEngine` to join with.

         Returns:
-
+            A new `FlowDataEngine` with the joined data.

         Raises:
-            Exception: If join
+            Exception: If the join configuration is invalid or if `verify_integrity`
+                is True and the join is predicted to be too large.
         """
         ensure_right_unselect_for_semi_and_anti_joins(join_input)
         verify_join_select_integrity(join_input, left_columns=self.columns, right_columns=other.columns)
         if not verify_join_map_integrity(join_input, left_columns=self.schema, right_columns=other.schema):
             raise Exception('Join is not valid by the data fields')
-
         if auto_generate_selection:
             join_input.auto_rename()
         left = self.data_frame.select(get_select_columns(join_input.left_select.renames)).rename(join_input.left_select.rename_table)
         right = other.data_frame.select(get_select_columns(join_input.right_select.renames)).rename(join_input.right_select.rename_table)
-
         if verify_integrity and join_input.how != 'right':
             n_records = get_join_count(left, right, left_on_keys=join_input.left_join_keys,
                                        right_on_keys=join_input.right_join_keys, how=join_input.how)
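The join types named in the new `join` docstring map directly onto Polars' own `how` options. A small plain-Polars illustration of their semantics, independent of the `JoinInput` schema, whose construction is not shown in this diff:

```python
import polars as pl

left = pl.LazyFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})
right = pl.LazyFrame({"id": [2, 3, 4], "score": [0.5, 0.9, 0.1]})

inner = left.join(right, on="id", how="inner").collect()  # only matching keys
semi = left.join(right, on="id", how="semi").collect()    # left rows with a match, left columns only
anti = left.join(right, on="id", how="anti").collect()    # left rows without a match
```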
@@ -1554,16 +1840,17 @@ class FlowDataEngine:
                                 number_of_records=0, streamable=False)
         return fl

-    # Graph Operations
     def solve_graph(self, graph_solver_input: transform_schemas.GraphSolverInput) -> "FlowDataEngine":
-        """
-
+        """Solves a graph problem represented by 'from' and 'to' columns.
+
+        This is used for operations like finding connected components in a graph.

         Args:
-            graph_solver_input:
+            graph_solver_input: A `GraphSolverInput` object defining the source,
+                destination, and output column names.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the solved graph data.
         """
         lf = self.data_frame.with_columns(
             graph_solver(graph_solver_input.col_from, graph_solver_input.col_to)
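The `solve_graph` docstring above describes assigning rows to connected components based on a 'from' and a 'to' column. A tiny standalone union-find sketch of that idea (conceptual only; the package's `graph_solver` expression is defined elsewhere):

```python
# Conceptual connected-components over (from, to) pairs; not the package's
# implementation, just the idea the docstring describes.
edges = [("a", "b"), ("b", "c"), ("x", "y")]

parent: dict[str, str] = {}


def find(node: str) -> str:
    parent.setdefault(node, node)
    while parent[node] != node:
        parent[node] = parent[parent[node]]  # path halving
        node = parent[node]
    return node


def union(a: str, b: str) -> None:
    parent[find(a)] = find(b)


for src, dst in edges:
    union(src, dst)

groups = {node: find(node) for node in parent}
print(groups)  # 'a', 'b', 'c' share one root; 'x', 'y' share another
```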
@@ -1571,42 +1858,41 @@ class FlowDataEngine:
         )
         return FlowDataEngine(lf)

-    # Data Modification Methods
     def add_new_values(self, values: Iterable, col_name: str = None) -> "FlowDataEngine":
-        """
-        Add a new column with specified values.
+        """Adds a new column with the provided values.

         Args:
-            values:
-            col_name:
+            values: An iterable (e.g., list, tuple) of values to add as a new column.
+            col_name: The name for the new column. Defaults to 'new_values'.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the added column.
         """
         if col_name is None:
             col_name = 'new_values'
         return FlowDataEngine(self.data_frame.with_columns(pl.Series(values).alias(col_name)))

     def get_record_count(self) -> "FlowDataEngine":
-        """
-
+        """Returns a new FlowDataEngine with a single column 'number_of_records'
+        containing the total number of records.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance.
         """
         return FlowDataEngine(self.data_frame.select(pl.len().alias('number_of_records')))

     def assert_equal(self, other: "FlowDataEngine", ordered: bool = True, strict_schema: bool = False):
-        """
-
+        """Asserts that this DataFrame is equal to another.
+
+        Useful for testing.

         Args:
-            other:
-            ordered:
-            strict_schema:
+            other: The other `FlowDataEngine` to compare with.
+            ordered: If True, the row order must be identical.
+            strict_schema: If True, the data types of the schemas must be identical.

         Raises:
-            Exception: If DataFrames are not equal
+            Exception: If the DataFrames are not equal based on the specified criteria.
         """
         org_laziness = self.lazy, other.lazy
         self.lazy = False
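Usage sketches for `add_new_values` and `get_record_count` as documented above; the input frame is hypothetical and `FlowDataEngine` is assumed to be in scope:

```python
import polars as pl

engine = FlowDataEngine(pl.LazyFrame({"id": [1, 2, 3]}))  # hypothetical input

# Append a literal column; the default name is 'new_values' per the docstring.
with_flags = engine.add_new_values([True, False, True], col_name="is_active")

# Single-cell frame holding the total row count in 'number_of_records'.
counts = engine.get_record_count()
```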
@@ -1634,14 +1920,14 @@ class FlowDataEngine:
         self.lazy, other.lazy = org_laziness
         assert self_lf.equals(other_lf), 'Data is not equal'

-    # Initialization Methods
     def initialize_empty_fl(self):
-        """
+        """Initializes an empty LazyFrame."""
         self.data_frame = pl.LazyFrame()
         self.number_of_records = 0
         self._lazy = True

     def _calculate_number_of_records_in_worker(self) -> int:
+        """Calculates the number of records in a worker process."""
         number_of_records = ExternalDfFetcher(
             lf=self.data_frame,
             operation_type="calculate_number_of_records",
@@ -1653,18 +1939,20 @@ class FlowDataEngine:

     def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
                               calculate_in_worker_process: bool = False) -> int:
-        """
-
+        """Gets the total number of records in the DataFrame.
+
+        For lazy frames, this may trigger a full data scan, which can be expensive.

         Args:
-            warn:
-            force_calculate:
-            calculate_in_worker_process:
+            warn: If True, logs a warning if a potentially expensive calculation is triggered.
+            force_calculate: If True, forces recalculation even if a value is cached.
+            calculate_in_worker_process: If True, offloads the calculation to a worker process.
+
         Returns:
-
+            The total number of records.

         Raises:
-
+            ValueError: If the number of records could not be determined.
         """
         if self.is_future and not self.is_collected:
             return -1
@@ -1675,37 +1963,39 @@ class FlowDataEngine:

         if self.lazy:
             if calculate_in_worker_process:
-                self.number_of_records = self._calculate_number_of_records_in_worker()
-            else:
-                if warn:
-                    logger.warning('Calculating the number of records this can be expensive on a lazy frame')
                 try:
-                    self.number_of_records = self.
-
-                except Exception:
-
+                    self.number_of_records = self._calculate_number_of_records_in_worker()
+                    return self.number_of_records
+                except Exception as e:
+                    logger.error(f"Error: {e}")
+            if warn:
+                logger.warning('Calculating the number of records this can be expensive on a lazy frame')
+            try:
+                self.number_of_records = self.data_frame.select(pl.len()).collect(
+                    engine="streaming" if self._streamable else "auto")[0, 0]
+            except Exception:
+                raise ValueError('Could not get number of records')
         else:
             self.number_of_records = self.data_frame.__len__()
         return self.number_of_records

-    # Properties
     @property
     def has_errors(self) -> bool:
-        """
+        """Checks if there are any errors."""
         return len(self.errors) > 0

     @property
     def lazy(self) -> bool:
-        """
+        """Indicates if the DataFrame is in lazy mode."""
         return self._lazy

     @lazy.setter
     def lazy(self, exec_lazy: bool = False):
-        """
-        Set the laziness of the DataFrame.
+        """Sets the laziness of the DataFrame.

         Args:
-            exec_lazy:
+            exec_lazy: If True, converts the DataFrame to a LazyFrame. If False,
+                collects the data and converts it to an eager DataFrame.
         """
         if exec_lazy != self._lazy:
             if exec_lazy:
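The new fallback path above counts rows with `select(pl.len())` followed by a collect. The same pattern in isolation, using plain Polars (the `engine=` keyword used in the diff requires a recent Polars release, so a plain `collect()` is shown here):

```python
import polars as pl

lf = pl.LazyFrame({"a": list(range(10))})

# Count rows without materializing the full frame; this mirrors the fallback
# added in the diff, which additionally passes engine="streaming"/"auto".
n_rows = lf.select(pl.len()).collect()[0, 0]
print(n_rows)  # 10
```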
@@ -1721,42 +2011,40 @@ class FlowDataEngine:

     @property
     def external_source(self) -> ExternalDataSource:
-        """
+        """The external data source, if any."""
         return self._external_source

     @property
     def cols_idx(self) -> Dict[str, int]:
-        """
+        """A dictionary mapping column names to their integer index."""
         if self._col_idx is None:
             self._col_idx = {c: i for i, c in enumerate(self.columns)}
         return self._col_idx

     @property
     def __name__(self) -> str:
-        """
+        """The name of the table."""
         return self.name

-    # Schema and Column Operations
     def get_select_inputs(self) -> transform_schemas.SelectInputs:
-        """
-        Get select inputs for all columns.
+        """Gets `SelectInput` specifications for all columns in the current schema.

         Returns:
-            SelectInputs
+            A `SelectInputs` object that can be used to configure selection or
+            transformation operations.
         """
         return transform_schemas.SelectInputs(
             [transform_schemas.SelectInput(old_name=c.name, data_type=c.data_type) for c in self.schema]
         )

     def select_columns(self, list_select: Union[List[str], Tuple[str], str]) -> "FlowDataEngine":
-        """
-        Select specific columns from the DataFrame.
+        """Selects a subset of columns from the DataFrame.

         Args:
-            list_select:
+            list_select: A list, tuple, or single string of column names to select.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance containing only the selected columns.
         """
         if isinstance(list_select, str):
             list_select = [list_select]
@@ -1773,14 +2061,13 @@ class FlowDataEngine:
         )

     def drop_columns(self, columns: List[str]) -> "FlowDataEngine":
-        """
-        Drop specified columns from the DataFrame.
+        """Drops specified columns from the DataFrame.

         Args:
-            columns:
+            columns: A list of column names to drop.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance without the dropped columns.
         """
         cols_for_select = tuple(set(self.columns) - set(columns))
         idx_to_keep = [self.cols_idx.get(c) for c in cols_for_select]
@@ -1793,14 +2080,13 @@ class FlowDataEngine:
         )

     def reorganize_order(self, column_order: List[str]) -> "FlowDataEngine":
-        """
-        Reorganize columns in specified order.
+        """Reorganizes columns into a specified order.

         Args:
-            column_order:
+            column_order: A list of column names in the desired order.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the columns reordered.
         """
         df = self.data_frame.select(column_order)
         schema = sorted(self.schema, key=lambda x: column_order.index(x.column_name))
@@ -1808,16 +2094,15 @@ class FlowDataEngine:

     def apply_flowfile_formula(self, func: str, col_name: str,
                                output_data_type: pl.DataType = None) -> "FlowDataEngine":
-        """
-        Apply a formula to create a new column.
+        """Applies a formula to create a new column or transform an existing one.

         Args:
-            func:
-            col_name:
-            output_data_type:
+            func: A string containing a Polars expression formula.
+            col_name: The name of the new or transformed column.
+            output_data_type: The desired Polars data type for the output column.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the applied formula.
         """
         parsed_func = to_expr(func)
         if output_data_type is not None:
@@ -1829,16 +2114,15 @@ class FlowDataEngine:

     def apply_sql_formula(self, func: str, col_name: str,
                           output_data_type: pl.DataType = None) -> "FlowDataEngine":
-        """
-        Apply an SQL-style formula to create a new column.
+        """Applies an SQL-style formula using `pl.sql_expr`.

         Args:
-            func:
-            col_name:
-            output_data_type:
+            func: A string containing an SQL expression.
+            col_name: The name of the new or transformed column.
+            output_data_type: The desired Polars data type for the output column.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with the applied formula.
         """
         expr = to_expr(func)
         if output_data_type not in (None, "Auto"):
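`apply_sql_formula` accepts an SQL fragment, which the new docstring ties to `pl.sql_expr`. A standalone illustration of that Polars API:

```python
import polars as pl

df = pl.DataFrame({"price": [10.0, 20.0], "quantity": [2, 3]})

# pl.sql_expr turns an SQL fragment into a Polars expression, which is what
# the apply_sql_formula docstring refers to.
result = df.with_columns(pl.sql_expr("price * quantity").alias("total"))
print(result)
```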
@@ -1850,16 +2134,18 @@ class FlowDataEngine:

     def output(self, output_fs: input_schema.OutputSettings, flow_id: int, node_id: int | str,
                execute_remote: bool = True) -> "FlowDataEngine":
-        """
-
+        """Writes the DataFrame to an output file.
+
+        Can execute the write operation locally or in a remote worker process.

         Args:
-            output_fs:
-            flow_id:
-            node_id:
-            execute_remote: If
+            output_fs: An `OutputSettings` object with details about the output file.
+            flow_id: The flow ID for tracking.
+            node_id: The node ID for tracking.
+            execute_remote: If True, executes the write in a worker process.
+
         Returns:
-            FlowDataEngine
+            The same `FlowDataEngine` instance for chaining.
         """
         logger.info('Starting to write output')
         if execute_remote:
@@ -1891,30 +2177,28 @@ class FlowDataEngine:
         logger.info("Finished writing output")
         return self

-    # Data Operations
     def make_unique(self, unique_input: transform_schemas.UniqueInput = None) -> "FlowDataEngine":
-        """
-        Get unique rows based on specified columns.
+        """Gets the unique rows from the DataFrame.

         Args:
-            unique_input:
+            unique_input: A `UniqueInput` object specifying a subset of columns
+                to consider for uniqueness and a strategy for keeping rows.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` instance with unique rows.
         """
         if unique_input is None or unique_input.columns is None:
             return FlowDataEngine(self.data_frame.unique())
         return FlowDataEngine(self.data_frame.unique(unique_input.columns, keep=unique_input.strategy))

     def concat(self, other: Iterable["FlowDataEngine"] | "FlowDataEngine") -> "FlowDataEngine":
-        """
-        Concatenate with other DataFrames.
+        """Concatenates this DataFrame with one or more other DataFrames.

         Args:
-            other:
+            other: A single `FlowDataEngine` or an iterable of them.

         Returns:
-            FlowDataEngine
+            A new `FlowDataEngine` containing the concatenated data.
         """
         if isinstance(other, FlowDataEngine):
             other = [other]
@@ -1924,15 +2208,15 @@ class FlowDataEngine:

     def do_select(self, select_inputs: transform_schemas.SelectInputs,
                   keep_missing: bool = True) -> "FlowDataEngine":
-        """
-        Perform complex column selection and transformation.
+        """Performs a complex column selection, renaming, and reordering operation.

         Args:
-            select_inputs:
-            keep_missing:
+            select_inputs: A `SelectInputs` object defining the desired transformations.
+            keep_missing: If True, columns not specified in `select_inputs` are kept.
+                If False, they are dropped.

         Returns:
-
+            A new `FlowDataEngine` with the transformed selection.
         """
         new_schema = deepcopy(self.schema)
         renames = [r for r in select_inputs.renames if r.is_available]
@@ -1968,29 +2252,29 @@ class FlowDataEngine:
         output_file = FlowDataEngine(ndf, number_of_records=self.number_of_records)
         return output_file.reorganize_order(sorted_cols)

-    # Utility Methods
     def set_streamable(self, streamable: bool = False):
-        """
+        """Sets whether DataFrame operations should be streamable."""
         self._streamable = streamable

     def _calculate_schema(self) -> List[Dict]:
-        """
+        """Calculates schema statistics."""
        if self.external_source is not None:
             self.collect_external()
         v = utils.calculate_schema(self.data_frame)
         return v

     def calculate_schema(self):
-        """
+        """Calculates and returns the schema."""
         self._calculate_schema_stats = True
         return self.schema

     def count(self) -> int:
-        """
+        """Gets the total number of records."""
         return self.get_number_of_records()

     @classmethod
     def create_from_path_worker(cls, received_table: input_schema.ReceivedTable, flow_id: int, node_id: int | str):
+        """Creates a FlowDataEngine from a path in a worker process."""
         received_table.set_absolute_filepath()
         external_fetcher = ExternalCreateFetcher(received_table=received_table,
                                                  file_type=received_table.file_type, flow_id=flow_id, node_id=node_id)
@@ -1998,14 +2282,19 @@ class FlowDataEngine:


 def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowDataEngine":
-    """
-
+    """Executes arbitrary Polars code on one or more FlowDataEngine objects.
+
+    This function takes a string of Python code that uses Polars and executes it.
+    Input `FlowDataEngine` objects are made available in the code's scope as
+    `input_df` (for a single input) or `input_df_1`, `input_df_2`, etc.

     Args:
-
+        *flowfile_tables: A variable number of `FlowDataEngine` objects to be
+            used as input to the code.
+        code: A string containing the Polars code to execute.

     Returns:
-        FlowDataEngine
+        A new `FlowDataEngine` instance containing the result of the executed code.
     """
     polars_executable = polars_code_parser.get_executable(code, num_inputs=len(flowfile_tables))
     if len(flowfile_tables) == 0:
@@ -2017,5 +2306,4 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
     df = polars_executable(**kwargs)
     if isinstance(df, pl.DataFrame):
         logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
-    return FlowDataEngine(df)
-
+    return FlowDataEngine(df)
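Finally, a usage sketch of `execute_polars_code` based on the new docstring: a single input is exposed to the code string as `input_df`, multiple inputs as `input_df_1`, `input_df_2`, and so on. The input frame is hypothetical, and it is assumed that `pl` is available inside the executed code's scope and that `FlowDataEngine` and `execute_polars_code` are imported from this module:

```python
import polars as pl

orders = FlowDataEngine(pl.LazyFrame({"amount": [10, 20, 30]}))  # hypothetical input

# With a single input, the code string sees it as `input_df`.
totals = execute_polars_code(
    orders,
    code="input_df.select(pl.col('amount').sum().alias('total_amount'))",
)
```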