Flowfile 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of Flowfile was flagged by the registry.
- build_backends/__init__.py +0 -0
- build_backends/main.py +313 -0
- build_backends/main_prd.py +202 -0
- flowfile/__init__.py +71 -0
- flowfile/__main__.py +24 -0
- flowfile-0.2.2.dist-info/LICENSE +21 -0
- flowfile-0.2.2.dist-info/METADATA +225 -0
- flowfile-0.2.2.dist-info/RECORD +171 -0
- flowfile-0.2.2.dist-info/WHEEL +4 -0
- flowfile-0.2.2.dist-info/entry_points.txt +9 -0
- flowfile_core/__init__.py +13 -0
- flowfile_core/auth/__init__.py +0 -0
- flowfile_core/auth/jwt.py +140 -0
- flowfile_core/auth/models.py +40 -0
- flowfile_core/auth/secrets.py +178 -0
- flowfile_core/configs/__init__.py +35 -0
- flowfile_core/configs/flow_logger.py +433 -0
- flowfile_core/configs/node_store/__init__.py +0 -0
- flowfile_core/configs/node_store/nodes.py +98 -0
- flowfile_core/configs/settings.py +120 -0
- flowfile_core/database/__init__.py +0 -0
- flowfile_core/database/connection.py +51 -0
- flowfile_core/database/init_db.py +45 -0
- flowfile_core/database/models.py +41 -0
- flowfile_core/fileExplorer/__init__.py +0 -0
- flowfile_core/fileExplorer/funcs.py +259 -0
- flowfile_core/fileExplorer/utils.py +53 -0
- flowfile_core/flowfile/FlowfileFlow.py +1403 -0
- flowfile_core/flowfile/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/__init__.py +0 -0
- flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
- flowfile_core/flowfile/analytics/__init__.py +0 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
- flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
- flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
- flowfile_core/flowfile/analytics/utils.py +9 -0
- flowfile_core/flowfile/connection_manager/__init__.py +3 -0
- flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
- flowfile_core/flowfile/connection_manager/models.py +10 -0
- flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
- flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
- flowfile_core/flowfile/database_connection_manager/models.py +15 -0
- flowfile_core/flowfile/extensions.py +36 -0
- flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
- flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
- flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
- flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
- flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
- flowfile_core/flowfile/flow_data_engine/types.py +0 -0
- flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
- flowfile_core/flowfile/flow_node/__init__.py +0 -0
- flowfile_core/flowfile/flow_node/flow_node.py +771 -0
- flowfile_core/flowfile/flow_node/models.py +111 -0
- flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
- flowfile_core/flowfile/handler.py +123 -0
- flowfile_core/flowfile/manage/__init__.py +0 -0
- flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
- flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
- flowfile_core/flowfile/manage/open_flowfile.py +136 -0
- flowfile_core/flowfile/setting_generator/__init__.py +2 -0
- flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
- flowfile_core/flowfile/setting_generator/settings.py +176 -0
- flowfile_core/flowfile/sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
- flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
- flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
- flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
- flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
- flowfile_core/flowfile/util/__init__.py +0 -0
- flowfile_core/flowfile/util/calculate_layout.py +137 -0
- flowfile_core/flowfile/util/execution_orderer.py +141 -0
- flowfile_core/flowfile/utils.py +106 -0
- flowfile_core/main.py +138 -0
- flowfile_core/routes/__init__.py +0 -0
- flowfile_core/routes/auth.py +34 -0
- flowfile_core/routes/logs.py +163 -0
- flowfile_core/routes/public.py +10 -0
- flowfile_core/routes/routes.py +601 -0
- flowfile_core/routes/secrets.py +85 -0
- flowfile_core/run_lock.py +11 -0
- flowfile_core/schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
- flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
- flowfile_core/schemas/defaults.py +9 -0
- flowfile_core/schemas/external_sources/__init__.py +0 -0
- flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
- flowfile_core/schemas/input_schema.py +477 -0
- flowfile_core/schemas/models.py +193 -0
- flowfile_core/schemas/output_model.py +115 -0
- flowfile_core/schemas/schemas.py +106 -0
- flowfile_core/schemas/transform_schema.py +569 -0
- flowfile_core/secrets/__init__.py +0 -0
- flowfile_core/secrets/secrets.py +64 -0
- flowfile_core/utils/__init__.py +0 -0
- flowfile_core/utils/arrow_reader.py +247 -0
- flowfile_core/utils/excel_file_manager.py +18 -0
- flowfile_core/utils/fileManager.py +45 -0
- flowfile_core/utils/fl_executor.py +38 -0
- flowfile_core/utils/utils.py +8 -0
- flowfile_frame/__init__.py +56 -0
- flowfile_frame/__main__.py +12 -0
- flowfile_frame/adapters.py +17 -0
- flowfile_frame/expr.py +1163 -0
- flowfile_frame/flow_frame.py +2093 -0
- flowfile_frame/group_frame.py +199 -0
- flowfile_frame/join.py +75 -0
- flowfile_frame/selectors.py +242 -0
- flowfile_frame/utils.py +184 -0
- flowfile_worker/__init__.py +55 -0
- flowfile_worker/configs.py +95 -0
- flowfile_worker/create/__init__.py +37 -0
- flowfile_worker/create/funcs.py +146 -0
- flowfile_worker/create/models.py +86 -0
- flowfile_worker/create/pl_types.py +35 -0
- flowfile_worker/create/read_excel_tables.py +110 -0
- flowfile_worker/create/utils.py +84 -0
- flowfile_worker/external_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
- flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
- flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- flowfile_worker/external_sources/sql_source/__init__.py +0 -0
- flowfile_worker/external_sources/sql_source/main.py +56 -0
- flowfile_worker/external_sources/sql_source/models.py +72 -0
- flowfile_worker/flow_logger.py +58 -0
- flowfile_worker/funcs.py +327 -0
- flowfile_worker/main.py +108 -0
- flowfile_worker/models.py +95 -0
- flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
- flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
- flowfile_worker/polars_fuzzy_match/models.py +36 -0
- flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
- flowfile_worker/polars_fuzzy_match/process.py +86 -0
- flowfile_worker/polars_fuzzy_match/utils.py +50 -0
- flowfile_worker/process_manager.py +36 -0
- flowfile_worker/routes.py +440 -0
- flowfile_worker/secrets.py +148 -0
- flowfile_worker/spawner.py +187 -0
- flowfile_worker/utils.py +25 -0
- test_utils/__init__.py +3 -0
- test_utils/postgres/__init__.py +1 -0
- test_utils/postgres/commands.py +109 -0
- test_utils/postgres/fixtures.py +417 -0
flowfile_frame/flow_frame.py
@@ -0,0 +1,2093 @@
import uuid
import os
from typing import Any, Iterable, List, Literal, Optional, Tuple, Union
from pathlib import Path

import re
import polars as pl
from polars._typing import FrameInitTypes, SchemaDefinition, SchemaDict, Orientation

# Assume these imports are correct from your original context
from flowfile_core.flowfile.FlowfileFlow import FlowGraph, add_connection
from flowfile_core.flowfile.flow_data_engine.flow_data_engine import FlowDataEngine
from flowfile_core.flowfile.flow_node.flow_node import FlowNode
from flowfile_core.schemas import input_schema, transform_schema

from flowfile_frame.expr import Expr, Column, lit, col
from flowfile_frame.selectors import Selector
from flowfile_frame.group_frame import GroupByFrame
from flowfile_frame.utils import _parse_inputs_as_iterable, create_etl_graph
from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings

node_id_counter = 0


def _to_string_val(v) -> Any:
    # Quote strings for embedding in generated polars code; pass other values through unchanged.
    if isinstance(v, str):
        return f"'{v}'"
    else:
        return v


def generate_node_id() -> int:
    global node_id_counter
    node_id_counter += 1
    return node_id_counter

class FlowFrame:
    """Main class that wraps FlowDataEngine and maintains the ETL graph."""

    flow_graph: FlowGraph
    data: pl.LazyFrame

    @staticmethod
    def create_from_any_type(
        data: FrameInitTypes = None,
        schema: SchemaDefinition | None = None,
        *,
        schema_overrides: SchemaDict | None = None,
        strict: bool = True,
        orient: Orientation | None = None,
        infer_schema_length: int | None = 100,
        nan_to_null: bool = False,
        flow_graph=None,
        node_id=None,
        parent_node_id=None,
    ):
        """
        Naive implementation that creates a frame from any supported input type: the data
        is first converted to a polars frame, then loaded into the graph as a manual input node.

        Parameters
        ----------
        data : FrameInitTypes
            Data to initialize the frame with
        schema : SchemaDefinition, optional
            Schema definition for the data
        schema_overrides : pl.SchemaDict, optional
            Schema overrides for specific columns
        strict : bool, default True
            Whether to enforce the schema strictly
        orient : pl.Orientation, optional
            Orientation of the data
        infer_schema_length : int, default 100
            Number of rows to use for schema inference
        nan_to_null : bool, default False
            Whether to convert NaN values to null
        flow_graph : FlowGraph, optional
            Existing ETL graph to add nodes to
        node_id : int, optional
            ID for the new node
        parent_node_id : int, optional
            ID of the parent node

        Returns
        -------
        FlowFrame
            A new FlowFrame with the data loaded as a manual input node
        """
        # Extract flow-specific parameters
        node_id = node_id or generate_node_id()
        description = "Data imported from Python object"

        # Create a new flow graph if none is provided
        if flow_graph is None:
            flow_graph = create_etl_graph()

        flow_id = flow_graph.flow_id

        # Convert data to a polars DataFrame/LazyFrame
        try:
            # Use polars to convert from various types
            pl_df = pl.DataFrame(
                data,
                schema=schema,
                schema_overrides=schema_overrides,
                strict=strict,
                orient=orient,
                infer_schema_length=infer_schema_length,
                nan_to_null=nan_to_null,
            )
            pl_data = pl_df.lazy()
        except Exception as e:
            raise ValueError(f"Could not convert data to a polars DataFrame: {e}") from e

        # Create a FlowDataEngine to get data in the right format for manual input
        flow_table = FlowDataEngine(raw_data=pl_data)

        # Create a manual input node
        input_node = input_schema.NodeManualInput(
            flow_id=flow_id,
            node_id=node_id,
            raw_data=flow_table.to_pylist(),  # Convert to list of dicts
            pos_x=100,
            pos_y=100,
            is_setup=True,
            description=description,
        )

        # Add to graph
        flow_graph.add_manual_input(input_node)

        # Return new frame
        return FlowFrame(
            data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
            flow_graph=flow_graph,
            node_id=node_id,
            parent_node_id=parent_node_id,
        )

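Because `__new__` (below) routes any non-LazyFrame input through `create_from_any_type`, constructing a frame from plain Python data implicitly builds a one-node graph. A minimal sketch of that entry point; the import path is this module, and the dict-of-lists input is one of the `FrameInitTypes` polars accepts:

# Illustrative sketch, not part of the released diff.
from flowfile_frame.flow_frame import FlowFrame

df = FlowFrame({"id": [1, 2, 3], "total": [10.0, 20.0, 5.0]})  # becomes a manual input node
print(df.node_id, df.flow_graph.flow_id)  # the frame carries its ETL graph with it
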
    def __new__(
        cls,
        data: pl.LazyFrame | FrameInitTypes = None,
        schema: SchemaDefinition | None = None,
        *,
        schema_overrides: SchemaDict | None = None,
        strict: bool = True,
        orient: Orientation | None = None,
        infer_schema_length: int | None = 100,
        nan_to_null: bool = False,
        flow_graph=None,
        node_id=None,
        parent_node_id=None,
    ):
        """Create a new FlowFrame instance."""
        # If data is not a LazyFrame, use the factory method
        if data is not None and not isinstance(data, pl.LazyFrame):
            return cls.create_from_any_type(
                data=data,
                schema=schema,
                schema_overrides=schema_overrides,
                strict=strict,
                orient=orient,
                infer_schema_length=infer_schema_length,
                nan_to_null=nan_to_null,
                flow_graph=flow_graph,
                node_id=node_id,
                parent_node_id=parent_node_id,
            )

        # Otherwise create the instance normally
        instance = super().__new__(cls)
        return instance

    def __init__(
        self,
        data: pl.LazyFrame | FrameInitTypes = None,
        schema: SchemaDefinition | None = None,
        *,
        schema_overrides: SchemaDict | None = None,
        strict: bool = True,
        orient: Orientation | None = None,
        infer_schema_length: int | None = 100,
        nan_to_null: bool = False,
        flow_graph=None,
        node_id=None,
        parent_node_id=None,
    ):
        """Initialize the FlowFrame with data and graph references."""
        if data is None:
            data = pl.LazyFrame()
        if not isinstance(data, pl.LazyFrame):
            # Non-LazyFrame inputs were already handled by __new__ via create_from_any_type.
            return

        self.node_id = node_id or generate_node_id()
        self.parent_node_id = parent_node_id

        # Initialize graph
        if flow_graph is None:
            flow_graph = create_etl_graph()
        self.flow_graph = flow_graph
        # Set up data
        if isinstance(data, FlowDataEngine):
            self.data = data.data_frame
        else:
            self.data = data

    def __repr__(self):
        return str(self.data)

    def _add_connection(self, from_id, to_id, input_type: input_schema.InputType = "main"):
        """Helper method to add a connection between nodes."""
        connection = input_schema.NodeConnection.create_from_simple_input(
            from_id=from_id, to_id=to_id, input_type=input_type
        )
        add_connection(self.flow_graph, connection)

    def _create_child_frame(self, new_node_id):
        """Helper method to create a new FlowFrame that's a child of this one."""
        self._add_connection(self.node_id, new_node_id)
        return FlowFrame(
            data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
            flow_graph=self.flow_graph,
            node_id=new_node_id,
            parent_node_id=self.node_id,
        )

    def sort(
        self,
        by: List[Expr | str] | Expr | str,
        *more_by,
        descending: bool | List[bool] = False,
        nulls_last: bool = False,
        multithreaded: bool = True,
        maintain_order: bool = False,
        description: str = None,
    ):
        """
        Sort the dataframe by the given columns.

        Parameters
        ----------
        by : Expr, str, or list of Expr/str
            Column(s) to sort by. Accepts expression input. Strings are parsed as column names.
        *more_by : Expr or str
            Additional columns to sort by, specified as positional arguments.
        descending : bool or list of bool, default False
            Sort in descending order. When sorting by multiple columns, can be specified per column.
        nulls_last : bool or list of bool, default False
            Place null values last; can specify a single boolean or a sequence for per-column control.
        multithreaded : bool, default True
            Sort using multiple threads.
        maintain_order : bool, default False
            Whether the order should be maintained if elements are equal.
        description : str, optional
            Description of this operation for the ETL graph.

        Returns
        -------
        FlowFrame
            A new FlowFrame with sorted data.
        """
        by = list(_parse_inputs_as_iterable((by,)))
        new_node_id = generate_node_id()
        sort_expressions = by
        if more_by:
            sort_expressions.extend(more_by)

        # Determine if we need to use the polars code fallback
        needs_polars_code = False

        # Check for any expressions that are not simple columns
        for expr in sort_expressions:
            if not isinstance(expr, (str, Column)) or (
                isinstance(expr, Column) and expr._select_input.is_altered
            ):
                needs_polars_code = True
                break

        # Also need polars code if we're using the maintain_order or multithreaded params
        if maintain_order or not multithreaded:
            needs_polars_code = True

        # Standardize descending parameter
        if isinstance(descending, (list, tuple)):
            # Ensure the descending list has the same length as sort_expressions
            if len(descending) != len(sort_expressions):
                raise ValueError(
                    f"Length of descending ({len(descending)}) must match number of sort columns ({len(sort_expressions)})"
                )
            descending_values = descending
        else:
            descending_values = [descending] * len(sort_expressions)

        # Standardize nulls_last parameter
        if isinstance(nulls_last, (list, tuple)):
            if len(nulls_last) != len(sort_expressions):
                raise ValueError(
                    f"Length of nulls_last ({len(nulls_last)}) must match number of sort columns ({len(sort_expressions)})"
                )
            nulls_last_values = nulls_last
            # Any non-default nulls_last needs polars code
            if any(val is not False for val in nulls_last_values):
                needs_polars_code = True
        else:
            nulls_last_values = [nulls_last] * len(sort_expressions)
            # Non-default nulls_last needs polars code
            if nulls_last:
                needs_polars_code = True

        if needs_polars_code:
            # Generate polars code for complex cases
            code = self._generate_sort_polars_code(
                sort_expressions,
                descending_values,
                nulls_last_values,
                multithreaded,
                maintain_order,
            )
            self._add_polars_code(new_node_id, code, description)
        else:
            # Use the native implementation for simple cases
            sort_inputs = []
            for i, expr in enumerate(sort_expressions):
                # Convert expr to column name
                if isinstance(expr, Column):
                    column_name = expr.name
                elif isinstance(expr, str):
                    column_name = expr
                else:
                    column_name = str(expr)

                # Create SortByInput with appropriate settings
                sort_inputs.append(
                    transform_schema.SortByInput(
                        column=column_name,
                        how="desc" if descending_values[i] else "asc",
                    )
                )

            sort_settings = input_schema.NodeSort(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                sort_input=sort_inputs,
                pos_x=200,
                pos_y=150,
                is_setup=True,
                depending_on_id=self.node_id,
                description=description
                or f"Sort by {', '.join(str(e) for e in sort_expressions)}",
            )
            self.flow_graph.add_sort(sort_settings)

        return self._create_child_frame(new_node_id)

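Which representation a sort gets depends only on its arguments: plain column names with default options stay native, anything else is serialized to code. A sketch, given a FlowFrame df as in the earlier sketch (column names hypothetical):

# Illustrative sketch, not part of the released diff.
df.sort("total")                                   # native NodeSort, ascending
df.sort("total", "id", descending=[True, False])   # still native: plain columns only
df.sort("total", nulls_last=True)                  # non-default nulls_last -> polars code fallback
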
    def _generate_sort_polars_code(
        self,
        sort_expressions: list,
        descending_values: list,
        nulls_last_values: list,
        multithreaded: bool,
        maintain_order: bool,
    ) -> str:
        """Generate polars code for sort operations that need the fallback."""
        # Format expressions for code
        expr_strs = []
        for expr in sort_expressions:
            if isinstance(expr, (Expr, Column)):
                expr_strs.append(str(expr))
            elif isinstance(expr, str):
                expr_strs.append(f"'{expr}'")
            else:
                expr_strs.append(str(expr))

        # Format parameters
        if len(sort_expressions) == 1:
            by_arg = expr_strs[0]
        else:
            by_arg = f"[{', '.join(expr_strs)}]"

        # Build kwargs
        kwargs = {}

        # Only add descending if it's non-default
        if any(d for d in descending_values):
            if len(descending_values) == 1:
                kwargs["descending"] = descending_values[0]
            else:
                kwargs["descending"] = descending_values

        # Only add nulls_last if it's non-default
        if any(nl for nl in nulls_last_values):
            if len(nulls_last_values) == 1:
                kwargs["nulls_last"] = nulls_last_values[0]
            else:
                kwargs["nulls_last"] = nulls_last_values

        # Add other parameters if they're non-default
        if not multithreaded:
            kwargs["multithreaded"] = multithreaded

        if maintain_order:
            kwargs["maintain_order"] = maintain_order

        # Build kwargs string
        kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items())

        # Build final code
        if kwargs_str:
            return f"input_df.sort({by_arg}, {kwargs_str})"
        else:
            return f"input_df.sort({by_arg})"

    def _add_polars_code(self, new_node_id: int, code: str, description: str = None,
                         depending_on_ids: List[int] | None = None):
        polars_code_settings = input_schema.NodePolarsCode(
            flow_id=self.flow_graph.flow_id,
            node_id=new_node_id,
            polars_code_input=transform_schema.PolarsCodeInput(polars_code=code),
            is_setup=True,
            depending_on_ids=depending_on_ids if depending_on_ids is not None else [self.node_id],
            description=description,
        )
        self.flow_graph.add_polars_code(polars_code_settings)

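The generated string always targets the input_df placeholder that the Polars Code node binds at execution time. Tracing _generate_sort_polars_code by hand for a two-column fallback case:

# Illustrative trace, not part of the released diff.
# df.sort('a', 'b', nulls_last=True) produces the code string:
#   input_df.sort(['a', 'b'], nulls_last=[True, True])
# String columns are quoted, and the scalar nulls_last was broadcast per sort column.
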
    def join(
        self,
        other,
        on: List[str | Column] | str | Column = None,
        how: str = "inner",
        left_on: List[str | Column] | str | Column = None,
        right_on: List[str | Column] | str | Column = None,
        suffix: str = "_right",
        validate: str = None,
        nulls_equal: bool = False,
        coalesce: bool = None,
        maintain_order: Literal[None, "left", "right", "left_right", "right_left"] = None,
        description: str = None,
    ):
        """
        Add a join operation to the Logical Plan.

        Parameters
        ----------
        other : FlowFrame
            Other DataFrame.
        on : str or list of str, optional
            Name(s) of the join columns in both DataFrames.
        how : {'inner', 'left', 'outer', 'semi', 'anti', 'cross'}, default 'inner'
            Join strategy.
        left_on : str or list of str, optional
            Name(s) of the left join column(s).
        right_on : str or list of str, optional
            Name(s) of the right join column(s).
        suffix : str, default "_right"
            Suffix to add to columns with a duplicate name.
        validate : {"1:1", "1:m", "m:1", "m:m"}, optional
            Validate the join relationship.
        nulls_equal : bool, default False
            Join on null values. By default, null values will never produce matches.
        coalesce : bool, optional
            None: join-specific default.
            True: always coalesce join columns.
            False: never coalesce join columns.
        maintain_order : {None, 'left', 'right', 'left_right', 'right_left'}, optional
            Which DataFrame row order to preserve, if any. Do not rely on any observed
            ordering without explicitly setting this parameter, as your code may break in
            a future release. Not specifying any ordering can improve performance.
            Supported for inner, left, right and full joins.
            None: no specific ordering is desired; the ordering might differ across
            Polars versions or even between different runs.
            left: preserves the order of the left DataFrame.
            right: preserves the order of the right DataFrame.
            left_right: first preserves the order of the left DataFrame, then the right.
            right_left: first preserves the order of the right DataFrame, then the left.
        description : str, optional
            Description of the join operation for the ETL graph.

        Returns
        -------
        FlowFrame
            New FlowFrame with join operation applied.
        """
        new_node_id = generate_node_id()
        use_polars_code = not (maintain_order is None and
                               coalesce is None and
                               nulls_equal is False and
                               validate is None and
                               suffix == '_right')
        join_mappings = None
        if on is not None:
            left_columns = right_columns = _normalize_columns_to_list(on)
        elif left_on is not None and right_on is not None:
            left_columns = _normalize_columns_to_list(left_on)
            right_columns = _normalize_columns_to_list(right_on)
        elif how == 'cross' and left_on is None and right_on is None and on is None:
            left_columns = None
            right_columns = None
        else:
            raise ValueError("Must specify either 'on' or both 'left_on' and 'right_on'")

        # Ensure left and right column lists have the same length
        if how != 'cross' and len(left_columns) != len(right_columns):
            raise ValueError(
                f"Length mismatch: left columns ({len(left_columns)}) != right columns ({len(right_columns)})"
            )
        if not use_polars_code:
            join_mappings, use_polars_code = _create_join_mappings(
                left_columns, right_columns
            )

        if use_polars_code or suffix != '_right':
            _on = "[" + ', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in _normalize_columns_to_list(on)) + "]" if on else None
            _left = "[" + ', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in left_columns) + "]" if left_on else None
            _right = "[" + ', '.join(f"'{v}'" if isinstance(v, str) else str(v) for v in right_columns) + "]" if right_on else None
            code_kwargs = {"other": "input_df_2", "how": _to_string_val(how), "on": _on, "left_on": _left,
                           "right_on": _right, "suffix": _to_string_val(suffix), "validate": _to_string_val(validate),
                           "nulls_equal": nulls_equal, "coalesce": coalesce,
                           "maintain_order": _to_string_val(maintain_order)}
            kwargs_str = ", ".join(f"{k}={v}" for k, v in code_kwargs.items() if v is not None)
            code = f"input_df_1.join({kwargs_str})"
            self._add_polars_code(new_node_id, code, description, depending_on_ids=[self.node_id, other.node_id])
            self._add_connection(self.node_id, new_node_id, "main")
            other._add_connection(other.node_id, new_node_id, "main")
            result_frame = FlowFrame(
                data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
                flow_graph=self.flow_graph,
                node_id=new_node_id,
                parent_node_id=self.node_id,
            )

        elif join_mappings:
            left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
            right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)

            join_input = transform_schema.JoinInput(
                join_mapping=join_mappings,
                left_select=left_select.renames,
                right_select=right_select.renames,
                how=how,
            )
            join_input.auto_rename()
            # Create node settings
            join_settings = input_schema.NodeJoin(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                join_input=join_input,
                auto_generate_selection=True,
                verify_integrity=True,
                pos_x=200,
                pos_y=150,
                is_setup=True,
                depending_on_ids=[self.node_id, other.node_id],
                description=description or f"Join with {how} strategy",
            )
            self.flow_graph.add_join(join_settings)
            self._add_connection(self.node_id, new_node_id, "main")
            other._add_connection(other.node_id, new_node_id, "right")
            result_frame = FlowFrame(
                data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
                flow_graph=self.flow_graph,
                node_id=new_node_id,
                parent_node_id=self.node_id,
            )
        else:
            raise ValueError("Could not execute join")

        return result_frame

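Join follows the same split: default options yield a native Join node with auto-generated select mappings, while any of suffix, validate, nulls_equal, coalesce, or maintain_order forces the generated input_df_1.join(input_df_2, ...) form. A sketch (frames and keys hypothetical; both frames share one graph so the connections resolve):

# Illustrative sketch, not part of the released diff.
orders = FlowFrame({"id": [1, 2], "total": [10.0, 20.0]})
users = FlowFrame({"id": [1, 2], "name": ["a", "b"]}, flow_graph=orders.flow_graph)
native = orders.join(users, on="id")                   # native NodeJoin
fallback = orders.join(users, on="id", coalesce=True)  # polars code node
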
    def _add_number_of_records(self, new_node_id: int, description: str = None) -> "FlowFrame":
        node_number_of_records = input_schema.NodeRecordCount(
            flow_id=self.flow_graph.flow_id,
            node_id=new_node_id,
            pos_x=200,
            pos_y=100,
            is_setup=True,
            depending_on_id=self.node_id,
            description=description,
        )
        self.flow_graph.add_record_count(node_number_of_records)
        return self._create_child_frame(new_node_id)

    def select(self, *columns, description: str = None):
        """
        Select columns from the frame.

        Args:
            *columns: Column names or expressions
            description: Description of the step; this will be shown in the flowfile file

        Returns:
            A new FlowFrame with selected columns
        """
        # Create new node ID
        columns = _parse_inputs_as_iterable(columns)
        new_node_id = generate_node_id()
        existing_columns = self.columns

        if (len(columns) == 1 and isinstance(columns[0], Expr)
                and str(columns[0]) == "pl.Expr(len()).alias('number_of_records')"):
            return self._add_number_of_records(new_node_id, description)

        # Handle simple column names
        if all(isinstance(col_, (str, Column)) for col_ in columns):
            # Create select inputs
            select_inputs = [
                transform_schema.SelectInput(old_name=col_) if isinstance(col_, str) else col_.to_select_input()
                for col_ in columns
            ]
            dropped_columns = [transform_schema.SelectInput(c, keep=False) for c in existing_columns if
                               c not in [s.old_name for s in select_inputs]]
            select_inputs.extend(dropped_columns)
            select_settings = input_schema.NodeSelect(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                select_input=select_inputs,
                keep_missing=False,
                pos_x=200,
                pos_y=100,
                is_setup=True,
                depending_on_id=self.node_id,
                description=description,
            )

            # Add to graph
            self.flow_graph.add_select(select_settings)
            return self._create_child_frame(new_node_id)

        else:
            readable_exprs = []
            is_readable: bool = True
            for col_ in columns:
                if isinstance(col_, Expr):
                    readable_exprs.append(col_)
                elif isinstance(col_, Selector):
                    readable_exprs.append(col_)
                elif isinstance(col_, pl.Expr):
                    print('Warning: a raw polars expression cannot be rendered in the flowfile frontend; use the flowfile expr instead')
                    is_readable = False
                elif isinstance(col_, str) and col_ in self.columns:
                    col_expr = Column(col_)
                    readable_exprs.append(col_expr)
                else:
                    lit_expr = lit(col_)
                    readable_exprs.append(lit_expr)
            if is_readable:
                code = f"input_df.select([{', '.join(str(e) for e in readable_exprs)}])"
            else:
                raise ValueError('Raw polars expressions are not supported in select; use flowfile_frame expressions instead')

            self._add_polars_code(new_node_id, code, description)
            return self._create_child_frame(new_node_id)

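Plain names and simple Column objects become a native Select node, with unselected columns recorded as keep=False; expressions fall through to generated code. A sketch (assuming flowfile's Expr mirrors polars operator overloading, which expr.py suggests):

# Illustrative sketch, not part of the released diff.
df.select("id", "total")          # native NodeSelect
df.select(col("total") * 1.21)    # expression -> generated input_df.select([...])
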
    def filter(self, predicate: Expr | Any = None, *, flowfile_formula: str = None, description: str = None):
        """
        Filter rows based on a predicate.

        Args:
            predicate: Filter condition
            flowfile_formula: Formula with native support in the frontend
            description: Description of the step that is performed

        Returns:
            A new FlowFrame with filtered rows
        """
        new_node_id = generate_node_id()
        if predicate is not None:
            # For now we use the fallback on polars code.
            if isinstance(predicate, Expr):
                predicate_expr = predicate
            else:
                predicate_expr = lit(predicate)
            code = f"input_df.filter({str(predicate_expr)})"
            self._add_polars_code(new_node_id, code, description)

        elif flowfile_formula:
            # Create node settings
            filter_settings = input_schema.NodeFilter(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                filter_input=transform_schema.FilterInput(
                    advanced_filter=flowfile_formula,
                    filter_type="advanced"
                ),
                pos_x=200,
                pos_y=150,
                is_setup=True,
                depending_on_id=self.node_id,
                description=description,
            )

            self.flow_graph.add_filter(filter_settings)
        else:
            raise ValueError("filter requires either a predicate or a flowfile_formula")

        return self._create_child_frame(new_node_id)

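Both entry points end in the same child frame; only the node type differs. A sketch (the flowfile formula syntax belongs to the frontend and is assumed here):

# Illustrative sketch, not part of the released diff.
df.filter(col("total") > 15)                 # expression -> polars code node
df.filter(flowfile_formula="[total] > 15")   # native NodeFilter with an 'advanced' filter; syntax assumed
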
    def sink_csv(self,
                 file: str,
                 *args,
                 separator: str = ",",
                 encoding: str = "utf-8",
                 description: str = None):
        """
        Write the data to a CSV file.

        Args:
            file: Path or filename for the CSV file
            separator: Field delimiter to use, defaults to ','
            encoding: File encoding, defaults to 'utf-8'
            description: Description of this operation for the ETL graph

        Returns:
            Self for method chaining
        """
        return self.write_csv(file, *args, separator=separator, encoding=encoding, description=description)

    def write_parquet(
        self,
        path: str | os.PathLike,
        *,
        description: str = None,
        convert_to_absolute_path: bool = True,
        **kwargs: Any,
    ) -> "FlowFrame":
        """
        Write the data to a Parquet file. Creates a standard Output node if only
        'path' and standard options are provided. Falls back to a Polars Code node
        if other keyword arguments are used.

        Args:
            path: Path (string or pathlib.Path) or filename for the Parquet file.
                Note: writable file-like objects are not supported when using advanced options
                that trigger the Polars Code node fallback.
            description: Description of this operation for the ETL graph.
            convert_to_absolute_path: If the path needs to be set to a fixed location.
            **kwargs: Additional keyword arguments for polars.DataFrame.sink_parquet/write_parquet.
                If any kwargs other than 'description' or 'convert_to_absolute_path' are provided,
                a Polars Code node will be created instead of a standard Output node.
                Complex objects like IO streams or credential provider functions are NOT
                supported via this method's Polars Code fallback.

        Returns:
            Self for method chaining (new FlowFrame pointing to the output node).
        """
        new_node_id = generate_node_id()

        is_path_input = isinstance(path, (str, os.PathLike))
        if is_path_input:
            file_str = str(path)
            if "~" in file_str:
                file_str = os.path.expanduser(file_str)
            file_name = file_str.split(os.sep)[-1]
        else:
            # File-like objects are only rejected later, when the fallback is required.
            file_str = path
            file_name = "output.parquet"
        use_polars_code = bool(kwargs) or not is_path_input

        output_parquet_table = input_schema.OutputParquetTable(
            file_type="parquet"
        )
        output_settings = input_schema.OutputSettings(
            file_type='parquet',
            name=file_name,
            directory=file_str if is_path_input else str(file_str),
            output_parquet_table=output_parquet_table,
            output_csv_table=input_schema.OutputCsvTable(),
            output_excel_table=input_schema.OutputExcelTable()
        )

        if is_path_input:
            try:
                output_settings.set_absolute_filepath()
                if convert_to_absolute_path:
                    output_settings.directory = output_settings.abs_file_path
            except Exception as e:
                print(f"Warning: Could not determine absolute path for {file_str}: {e}")

        if not use_polars_code:
            node_output = input_schema.NodeOutput(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                output_settings=output_settings,
                depending_on_id=self.node_id,
                description=description,
            )
            self.flow_graph.add_output(node_output)
        else:
            if not is_path_input:
                raise TypeError(
                    f"Input 'path' must be a string or Path-like object when using advanced "
                    f"write_parquet options (kwargs={kwargs}), got {type(path)}."
                    " File-like objects are not supported with the Polars Code fallback."
                )

            # Use the potentially converted absolute path string
            path_arg_repr = repr(output_settings.directory)
            kwargs_repr = ", ".join(f"{k}={repr(v)}" for k, v in kwargs.items())
            args_str = f"path={path_arg_repr}"
            if kwargs_repr:
                args_str += f", {kwargs_repr}"

            # Use sink_parquet for LazyFrames
            code = f"input_df.sink_parquet({args_str})"
            self._add_polars_code(new_node_id, code, description)

        return self._create_child_frame(new_node_id)

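The branch is driven entirely by **kwargs: with none, you get a reproducible Output node; with any extra argument, the call is serialized via repr() into a sink_parquet string. A sketch (paths hypothetical):

# Illustrative sketch, not part of the released diff.
df.write_parquet("out/result.parquet")                      # native Output node
df.write_parquet("out/result.parquet", compression="zstd")  # fallback, roughly:
#   input_df.sink_parquet(path='/abs/.../out/result.parquet', compression='zstd')
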
    def write_csv(
        self,
        file: str | os.PathLike,
        *,
        separator: str = ",",
        encoding: str = "utf-8",
        description: str = None,
        convert_to_absolute_path: bool = True,
        **kwargs: Any,
    ) -> "FlowFrame":
        """
        Write the data to a CSV file. Creates a standard Output node for plain paths with
        the standard options, falling back to a Polars Code node when extra keyword
        arguments are supplied.
        """
        new_node_id = generate_node_id()

        is_path_input = isinstance(file, (str, os.PathLike))
        if is_path_input:
            file_str = str(file)
            if "~" in file_str:
                file_str = os.path.expanduser(file_str)
        else:
            file_str = file
        file_name = file_str.split(os.sep)[-1] if is_path_input else "output.csv"

        use_polars_code = bool(kwargs) or not is_path_input

        output_settings = input_schema.OutputSettings(
            file_type='csv',
            name=file_name,
            directory=file_str if is_path_input else str(file_str),
            output_csv_table=input_schema.OutputCsvTable(
                file_type="csv", delimiter=separator, encoding=encoding),
            output_excel_table=input_schema.OutputExcelTable(),
            output_parquet_table=input_schema.OutputParquetTable()
        )

        if is_path_input:
            try:
                output_settings.set_absolute_filepath()
                if convert_to_absolute_path:
                    output_settings.directory = output_settings.abs_file_path
            except Exception as e:
                print(f"Warning: Could not determine absolute path for {file_str}: {e}")

        if not use_polars_code:
            node_output = input_schema.NodeOutput(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                output_settings=output_settings,
                depending_on_id=self.node_id,
                description=description,
            )
            self.flow_graph.add_output(node_output)
        else:
            if not is_path_input:
                raise TypeError(
                    f"Input 'file' must be a string or Path-like object when using advanced "
                    f"write_csv options (kwargs={kwargs}), got {type(file)}."
                    " File-like objects are not supported with the Polars Code fallback."
                )

            path_arg_repr = repr(output_settings.directory)

            all_kwargs_for_code = {
                'separator': separator,
                'encoding': encoding,
                **kwargs  # Add the extra kwargs
            }
            kwargs_repr = ", ".join(f"{k}={repr(v)}" for k, v in all_kwargs_for_code.items())

            args_str = f"file={path_arg_repr}"
            if kwargs_repr:
                args_str += f", {kwargs_repr}"

            code = f"input_df.collect().write_csv({args_str})"
            self._add_polars_code(new_node_id, code, description)

        return self._create_child_frame(new_node_id)

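One asymmetry against the parquet path is worth seeing side by side: the CSV fallback materializes the frame with collect() before writing, rather than streaming through a sink. A sketch (quote_style is a real polars write_csv option, used here only to trigger the fallback):

# Illustrative sketch, not part of the released diff.
df.write_csv("out.csv")                        # native Output node
df.write_csv("out.csv", quote_style="always")  # fallback, roughly:
#   input_df.collect().write_csv(file='/abs/.../out.csv', separator=',', encoding='utf-8', quote_style='always')
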
    def group_by(self, *by, description: str = None, maintain_order=False, **named_by) -> GroupByFrame:
        """
        Start a group by operation.

        Parameters:
            *by: Column names or expressions to group by
            description: Optional description of this step for the frontend
            maintain_order: Keep groups in the order they appear in the data
            **named_by: Additional columns to group by with custom names

        Returns:
            GroupByFrame object for aggregations
        """
        # Process positional arguments
        new_node_id = generate_node_id()
        by_cols = []
        for col_expr in by:
            if isinstance(col_expr, str):
                by_cols.append(col_expr)
            elif isinstance(col_expr, Expr):
                by_cols.append(col_expr)
            elif isinstance(col_expr, Selector):
                by_cols.append(col_expr)
            elif isinstance(col_expr, (list, tuple)):
                by_cols.extend(col_expr)

        for new_name, col_expr in named_by.items():
            if isinstance(col_expr, str):
                by_cols.append(col(col_expr).alias(new_name))
            elif isinstance(col_expr, Expr):
                by_cols.append(col_expr.alias(new_name))

        # Create a GroupByFrame
        return GroupByFrame(
            node_id=new_node_id,
            parent_frame=self, by_cols=by_cols, maintain_order=maintain_order, description=description
        )

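group_by adds no node by itself; the GroupByFrame holds the keys until an aggregation materializes them. A sketch (the agg/sum API on GroupByFrame lives in group_frame.py and is assumed here):

# Illustrative sketch, not part of the released diff.
gb = df.group_by("name", region="country")  # 'country' grouped under the alias 'region'
result = gb.agg(col("total").sum())         # assumed GroupByFrame.agg, mirroring polars
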
    def to_graph(self):
        """Get the underlying ETL graph."""
        return self.flow_graph

    def save_graph(self, file_path: str, auto_arrange: bool = True):
        """Save the graph to a flowfile file."""
        if auto_arrange:
            self.flow_graph.apply_layout()
        self.flow_graph.save_flow(file_path)

    def collect(self):
        """Collect lazy data into memory."""
        if hasattr(self.data, "collect"):
            return self.data.collect()
        return self.data

    def _with_flowfile_formula(self, flowfile_formula: str, output_column_name, description: str = None) -> "FlowFrame":
        new_node_id = generate_node_id()
        function_settings = (
            input_schema.NodeFormula(flow_id=self.flow_graph.flow_id, node_id=new_node_id, depending_on_id=self.node_id,
                                     function=transform_schema.FunctionInput(
                                         function=flowfile_formula,
                                         field=transform_schema.FieldInput(name=output_column_name)),
                                     description=description))
        self.flow_graph.add_formula(function_settings)
        return self._create_child_frame(new_node_id)

    def head(self, n: int, description: str = None):
        new_node_id = generate_node_id()
        settings = input_schema.NodeSample(flow_id=self.flow_graph.flow_id,
                                           node_id=new_node_id,
                                           depending_on_id=self.node_id,
                                           sample_size=n,
                                           description=description
                                           )
        self.flow_graph.add_sample(settings)
        return self._create_child_frame(new_node_id)

    def limit(self, n: int, description: str = None):
        return self.head(n, description)

    def cache(self) -> "FlowFrame":
        setting_input = self.get_node_settings().setting_input
        setting_input.cache_results = True
        # LazyFrame.cache() returns a new frame rather than mutating in place
        self.data = self.data.cache()
        return self

    def get_node_settings(self) -> FlowNode:
        return self.flow_graph.get_node(self.node_id)

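These small methods turn a chain of frames into something deliverable: collect() executes the accumulated lazy plan, save_graph() arranges and persists the node graph for the visual editor. A sketch (the file extension is an assumption):

# Illustrative sketch, not part of the released diff.
result = df.filter(col("total") > 0).head(10)
print(result.collect())                  # executes the lazy polars plan
result.save_graph("pipeline.flowfile")   # applies the layout and saves the ETL graph
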
    def pivot(self,
              on: str | list[str],
              *,
              index: str | list[str] | None = None,
              values: str | list[str] | None = None,
              aggregate_function: str | None = "first",
              maintain_order: bool = True,
              sort_columns: bool = False,
              separator: str = '_',
              description: str = None) -> "FlowFrame":
        """
        Pivot a DataFrame from long to wide format.

        Parameters
        ----------
        on : str | list[str]
            Column values to use as column names in the pivoted DataFrame
        index : str | list[str] | None
            Column(s) to use as index/row identifiers in the pivoted DataFrame
        values : str | list[str] | None
            Column(s) that contain the values of the pivoted DataFrame
        aggregate_function : str | None
            Function to aggregate values if there are duplicate entries.
            Options: 'first', 'last', 'min', 'max', 'sum', 'mean', 'median', 'count'
        maintain_order : bool
            Whether to maintain the order of the columns/rows as they appear in the source
        sort_columns : bool
            Whether to sort the output columns
        separator : str
            Separator to use when joining column levels in the pivoted DataFrame
        description : str
            Description of this operation for the ETL graph

        Returns
        -------
        FlowFrame
            A new FlowFrame with pivoted data
        """
        new_node_id = generate_node_id()

        # Handle input standardization
        on_value = on[0] if isinstance(on, list) and len(on) == 1 else on

        # Create index_columns list
        if index is None:
            index_columns = []
        elif isinstance(index, str):
            index_columns = [index]
        else:
            index_columns = list(index)

        # Set values column
        if values is None:
            raise ValueError("Values parameter must be specified for pivot operation")

        value_col = values if isinstance(values, str) else values[0]

        # Set valid aggregations
        valid_aggs = ['first', 'last', 'min', 'max', 'sum', 'mean', 'median', 'count']
        if aggregate_function not in valid_aggs:
            raise ValueError(f"Invalid aggregate_function: {aggregate_function}. "
                             f"Must be one of: {', '.join(valid_aggs)}")

        # Check if we can use the native implementation
        can_use_native = (
            isinstance(on_value, str) and
            isinstance(value_col, str) and
            aggregate_function in valid_aggs
        )

        if can_use_native:
            # Create pivot input for the native implementation
            pivot_input = transform_schema.PivotInput(
                index_columns=index_columns,
                pivot_column=on_value,
                value_col=value_col,
                aggregations=[aggregate_function]
            )

            # Create node settings
            pivot_settings = input_schema.NodePivot(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                pivot_input=pivot_input,
                pos_x=200,
                pos_y=150,
                is_setup=True,
                depending_on_id=self.node_id,
                description=description or f"Pivot {value_col} by {on_value}"
            )

            # Add to graph using the native implementation
            self.flow_graph.add_pivot(pivot_settings)
        else:
            # Fall back to polars code for complex cases
            on_repr = repr(on)
            index_repr = repr(index)
            values_repr = repr(values)

            code = f"""
# Perform pivot operation
result = input_df.pivot(
    on={on_repr},
    index={index_repr},
    values={values_repr},
    aggregate_function='{aggregate_function}',
    maintain_order={maintain_order},
    sort_columns={sort_columns},
    separator="{separator}"
)
result
"""
            # Generate a description if not provided
            if description is None:
                on_str = on if isinstance(on, str) else ", ".join(on if isinstance(on, list) else [on])
                values_str = values if isinstance(values, str) else ", ".join(
                    values if isinstance(values, list) else [values])
                description = f"Pivot {values_str} by {on_str}"

            # Add polars code node
            self._add_polars_code(new_node_id, code, description)

        return self._create_child_frame(new_node_id)

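A single string for on and values stays native; list-valued arguments route through the generated pivot call. A sketch, given a suitably shaped frame (column names hypothetical):

# Illustrative sketch, not part of the released diff.
df.pivot(on="month", index="store", values="sales", aggregate_function="sum")  # native NodePivot
df.pivot(on=["month", "year"], index="store", values="sales")                  # list 'on' -> polars code
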
    def unpivot(self,
                on: list[str | Selector] | str | None | Selector = None,
                *,
                index: list[str] | str | None = None,
                variable_name: str = "variable",
                value_name: str = "value",
                description: str = None) -> "FlowFrame":
        """
        Unpivot a DataFrame from wide to long format.

        Parameters
        ----------
        on : list[str | Selector] | str | None | Selector
            Column(s) to unpivot (these become values in the value column).
            If None, all columns not in index will be used.
        index : list[str] | str | None
            Column(s) to use as identifier variables (stay as columns)
        variable_name : str, optional
            Name to give to the variable column, by default "variable"
        value_name : str, optional
            Name to give to the value column, by default "value"
        description : str, optional
            Description of this operation for the ETL graph

        Returns
        -------
        FlowFrame
            A new FlowFrame with unpivoted data
        """
        new_node_id = generate_node_id()

        # Standardize inputs
        if index is None:
            index_columns = []
        elif isinstance(index, str):
            index_columns = [index]
        else:
            index_columns = list(index)
        can_use_native = True
        if on is None:
            value_columns = []
        elif isinstance(on, (str, Selector)):
            if isinstance(on, Selector):
                can_use_native = False
            value_columns = [on]
        elif isinstance(on, Iterable):
            value_columns = list(on)
            # Non-string entries (e.g. Selectors) require the polars fallback.
            # (The original checked isinstance(..., Iterable), which would also
            # match plain strings and disable the native path for every list.)
            if value_columns and not isinstance(value_columns[0], str):
                can_use_native = False
        else:
            value_columns = [on]

        if can_use_native:
            can_use_native = (variable_name == "variable" and value_name == "value")
        if can_use_native:
            unpivot_input = transform_schema.UnpivotInput(
                index_columns=index_columns,
                value_columns=value_columns,
                data_type_selector=None,
                data_type_selector_mode='column'
            )

            # Create node settings
            unpivot_settings = input_schema.NodeUnpivot(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                unpivot_input=unpivot_input,
                pos_x=200,
                pos_y=150,
                is_setup=True,
                depending_on_id=self.node_id,
                description=description or "Unpivot data from wide to long format"
            )

            # Add to graph using native implementation
            self.flow_graph.add_unpivot(unpivot_settings)
        else:
            # Fall back to generated polars code for complex cases,
            # using the unpivot() method to match the polars API
            on_repr = repr(on)
            index_repr = repr(index)

            code = f"""
# Perform unpivot operation
output_df = input_df.unpivot(
    on={on_repr},
    index={index_repr},
    variable_name="{variable_name}",
    value_name="{value_name}"
)
output_df
"""
            # Generate description if not provided
            if description is None:
                index_str = ", ".join(index_columns) if index_columns else "none"
                value_str = ", ".join(str(c) for c in value_columns) if value_columns else "all non-index columns"
                description = f"Unpivot data with index: {index_str} and value cols: {value_str}"

            # Add polars code node
            self._add_polars_code(new_node_id, code, description)

        return self._create_child_frame(new_node_id)

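    # --- Editor's usage sketch (illustrative, not part of the package source).
    # Plain string columns with the default variable/value names keep the
    # native NodeUnpivot path; a Selector or custom names take the
    # generated-polars-code fallback, as implemented above:
    #
    #     >>> long = frame.unpivot(on=["jan", "feb"], index="region")        # native node
    #     >>> renamed = frame.unpivot(index="region", variable_name="month") # polars fallback
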
    def concat(
        self,
        other: Union["FlowFrame", List["FlowFrame"]],
        how: str = "vertical",
        rechunk: bool = False,
        parallel: bool = True,
        description: str = None,
    ) -> "FlowFrame":
        """
        Combine multiple FlowFrames into a single FlowFrame.

        This is equivalent to Polars' concat operation with various joining strategies.

        Parameters
        ----------
        other : FlowFrame or List[FlowFrame]
            One or more FlowFrames to concatenate with this one
        how : str, default 'vertical'
            How to combine the FlowFrames:
            - 'vertical': Stack frames on top of each other (equivalent to 'union all')
            - 'vertical_relaxed': Same as vertical but coerces columns to common supertypes
            - 'diagonal': Union of column schemas, filling missing values with null
            - 'diagonal_relaxed': Same as diagonal but coerces columns to common supertypes
            - 'horizontal': Stack horizontally (column-wise concat)
            - 'align', 'align_full', 'align_left', 'align_right': Auto-determine key columns
        rechunk : bool, default False
            Whether to ensure contiguous memory in result
        parallel : bool, default True
            Whether to use parallel processing for the operation
        description : str, optional
            Description of this operation for the ETL graph

        Returns
        -------
        FlowFrame
            A new FlowFrame with the concatenated data
        """
        new_node_id = generate_node_id()

        # Convert single FlowFrame to list
        if isinstance(other, FlowFrame):
            others = [other]
        else:
            others = other

        use_native = how == "diagonal_relaxed" and parallel and not rechunk

        if use_native:
            # Create union input for the transform schema
            union_input = transform_schema.UnionInput(
                mode="relaxed"  # This maps to diagonal_relaxed in polars
            )

            # Create node settings
            union_settings = input_schema.NodeUnion(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                union_input=union_input,
                pos_x=200,
                pos_y=150,
                is_setup=True,
                depending_on_ids=[self.node_id] + [frame.node_id for frame in others],
                description=description or "Concatenate dataframes",
            )

            # Add to graph
            self.flow_graph.add_union(union_settings)

            # Add connections
            self._add_connection(self.node_id, new_node_id, "main")
            for other_frame in others:
                other_frame._add_connection(other_frame.node_id, new_node_id, "main")
        else:
            # Fall back to polars code for other cases.
            # Create a list of input dataframe variables for the generated code.
            input_vars = ["input_df_1"]
            for i in range(len(others)):
                input_vars.append(f"input_df_{i + 2}")

            frames_list = f"[{', '.join(input_vars)}]"

            code = f"""
# Perform concat operation
output_df = pl.concat(
    {frames_list},
    how='{how}',
    rechunk={rechunk},
    parallel={parallel}
)
"""
            # Add polars code node with dependencies on all input frames
            depending_on_ids = [self.node_id] + [frame.node_id for frame in others]
            self._add_polars_code(
                new_node_id, code, description, depending_on_ids=depending_on_ids
            )

            # Add connections to ensure all frames are available
            self._add_connection(self.node_id, new_node_id, "main")
            for other_frame in others:
                other_frame._add_connection(other_frame.node_id, new_node_id, "main")

        # Create and return the new frame
        return FlowFrame(
            data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
            flow_graph=self.flow_graph,
            node_id=new_node_id,
            parent_node_id=self.node_id,
        )

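    # --- Editor's usage sketch (illustrative, not part of the package source).
    # Only how="diagonal_relaxed" with parallel=True and rechunk=False maps to
    # the native NodeUnion; every other combination becomes a pl.concat() node:
    #
    #     >>> combined = frame_a.concat(frame_b, how="diagonal_relaxed")     # native union
    #     >>> stacked = frame_a.concat([frame_b, frame_c], how="vertical")   # polars fallback
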
    def _detect_cum_count_record_id(
        self, expr: Any, new_node_id: int, description: Optional[str] = None
    ) -> Tuple[bool, Optional["FlowFrame"]]:
        """
        Detect if the expression is a cum_count operation and use record_id if possible.

        Parameters
        ----------
        expr : Any
            Expression to analyze
        new_node_id : int
            Node ID to use if creating a record_id node
        description : str, optional
            Description to use for the new node

        Returns
        -------
        Tuple[bool, Optional[FlowFrame]]
            A tuple containing:
            - bool: Whether a cum_count expression was detected and optimized
            - Optional[FlowFrame]: The new FlowFrame if detection was successful, otherwise None
        """
        # Check if this is a cum_count operation
        if (not isinstance(expr, Expr) or not expr._repr_str
                or "cum_count" not in expr._repr_str or not hasattr(expr, "name")):
            return False, None

        # Extract the output name
        output_name = expr.name

        if ".over(" not in expr._repr_str:
            # Simple cumulative count can be implemented as a record ID with offset=1
            record_id_input = transform_schema.RecordIdInput(
                output_column_name=output_name,
                offset=1,
                group_by=False,
                group_by_columns=[],
            )

            # Create node settings
            record_id_settings = input_schema.NodeRecordId(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                record_id_input=record_id_input,
                pos_x=200,
                pos_y=150,
                is_setup=True,
                depending_on_id=self.node_id,
                description=description or f"Add cumulative count as '{output_name}'",
            )

            # Add to graph using native implementation
            self.flow_graph.add_record_id(record_id_settings)
            return True, self._create_child_frame(new_node_id)

        # Check for windowed/partitioned cum_count
        elif ".over(" in expr._repr_str:
            # Try to extract partition columns from different patterns
            partition_columns = []

            # Case 1: Simple string column - .over('column')
            simple_match = re.search(r'\.over\([\'"]([^\'"]+)[\'"]\)', expr._repr_str)
            if simple_match:
                partition_columns = [simple_match.group(1)]

            # Case 2: List of column strings - .over(['col1', 'col2'])
            list_match = re.search(r"\.over\(\[(.*?)\]", expr._repr_str)
            if list_match:
                items = list_match.group(1).split(",")
                for item in items:
                    # Extract string column names from quoted strings
                    col_match = re.search(r'[\'"]([^\'"]+)[\'"]', item.strip())
                    if col_match:
                        partition_columns.append(col_match.group(1))

            # Case 3: pl.col expressions - .over(pl.col('category'), pl.col('abc'))
            col_matches = re.finditer(r'pl\.col\([\'"]([^\'"]+)[\'"]\)', expr._repr_str)
            for match in col_matches:
                partition_columns.append(match.group(1))

            # If we found partition columns, create a grouped record ID
            if partition_columns:
                # Use grouped record ID implementation
                record_id_input = transform_schema.RecordIdInput(
                    output_column_name=output_name,
                    offset=1,
                    group_by=True,
                    group_by_columns=partition_columns,
                )

                # Create node settings
                record_id_settings = input_schema.NodeRecordId(
                    flow_id=self.flow_graph.flow_id,
                    node_id=new_node_id,
                    record_id_input=record_id_input,
                    pos_x=200,
                    pos_y=150,
                    is_setup=True,
                    depending_on_id=self.node_id,
                    description=description
                    or f"Add grouped cumulative count as '{output_name}' by {', '.join(partition_columns)}",
                )

                # Add to graph using native implementation
                self.flow_graph.add_record_id(record_id_settings)
                return True, self._create_child_frame(new_node_id)

        # Not a cum_count we can optimize
        return False, None

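    # --- Editor's note (illustrative, not part of the package source).
    # Based on the regexes above, the detector recognizes expressions whose
    # repr looks like the following (assuming the Expr wrapper mirrors polars'
    # cum_count/over/alias and that .alias() populates the checked `name`):
    #
    #     >>> frame.with_columns(col("a").cum_count().alias("row_nr"))             # plain record-id node
    #     >>> frame.with_columns(col("a").cum_count().over("grp").alias("n"))      # grouped record-id node
    #     >>> frame.with_columns(col("a").cum_count().over(["g1", "g2"]).alias("n"))
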
    def with_columns(
        self,
        exprs: Optional[Union[Expr, List[Expr]]] = None,
        *,
        flowfile_formulas: Optional[List[str]] = None,
        output_column_names: Optional[List[str]] = None,
        description: Optional[str] = None,
    ) -> "FlowFrame":
        """
        Add multiple columns to the DataFrame.

        Parameters
        ----------
        exprs : Expr or List[Expr], optional
            Expressions to evaluate as new columns
        flowfile_formulas : List[str], optional
            Alternative approach using flowfile formula syntax
        output_column_names : List[str], optional
            Column names for the flowfile formulas
        description : str, optional
            Description of this operation for the ETL graph

        Returns
        -------
        FlowFrame
            A new FlowFrame with the columns added

        Raises
        ------
        ValueError
            If neither exprs nor flowfile_formulas with output_column_names are provided,
            or if the lengths of flowfile_formulas and output_column_names don't match
        """
        if exprs is not None:
            new_node_id = generate_node_id()
            exprs_iterable = _parse_inputs_as_iterable((exprs,))

            if len(exprs_iterable) == 1:
                detected, result = self._detect_cum_count_record_id(
                    exprs_iterable[0], new_node_id, description
                )
                if detected:
                    return result
            all_expressions = []
            for expression in exprs_iterable:
                if not isinstance(expression, (Expr, Column)):
                    all_expressions.append(lit(expression))
                else:
                    all_expressions.append(expression)

            code = (
                f"input_df.with_columns({', '.join(str(e) for e in all_expressions)})"
            )
            self._add_polars_code(new_node_id, code, description)
            return self._create_child_frame(new_node_id)

        elif flowfile_formulas is not None and output_column_names is not None:
            if len(output_column_names) != len(flowfile_formulas):
                raise ValueError(
                    "Lengths of flowfile_formulas and output_column_names must be identical"
                )

            if len(flowfile_formulas) == 1:
                return self._with_flowfile_formula(flowfile_formulas[0], output_column_names[0], description)
            ff = self
            for i, (flowfile_formula, output_column_name) in enumerate(zip(flowfile_formulas, output_column_names)):
                ff = ff._with_flowfile_formula(flowfile_formula, output_column_name, f"{i}: {description}")
            return ff
        else:
            raise ValueError(
                "Either exprs or flowfile_formulas with output_column_names must be provided"
            )

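    # --- Editor's usage sketch (illustrative, not part of the package source).
    # The two calling conventions implemented above, side by side. The formula
    # string syntax is an assumption; it is handled by _with_flowfile_formula,
    # which is outside this hunk:
    #
    #     >>> frame.with_columns(col("price") * col("qty"))
    #     >>> frame.with_columns(
    #     ...     flowfile_formulas=["[price] * [qty]"],   # syntax assumed
    #     ...     output_column_names=["total"],
    #     ... )
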
    def with_row_index(
        self, name: str = "index", offset: int = 0, description: str = None
    ) -> "FlowFrame":
        """
        Add a row index as the first column in the DataFrame.

        Parameters
        ----------
        name : str, default "index"
            Name of the index column.
        offset : int, default 0
            Start the index at this offset. Cannot be negative.
        description : str, optional
            Description of this operation for the ETL graph

        Returns
        -------
        FlowFrame
            A new FlowFrame with the row index column added
        """
        new_node_id = generate_node_id()

        # Check if we can use the native record_id implementation
        if name == "record_id" or (offset == 1 and name != "index"):
            # Create RecordIdInput - no grouping needed
            record_id_input = transform_schema.RecordIdInput(
                output_column_name=name,
                offset=offset,
                group_by=False,
                group_by_columns=[],
            )

            # Create node settings
            record_id_settings = input_schema.NodeRecordId(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                record_id_input=record_id_input,
                pos_x=200,
                pos_y=150,
                is_setup=True,
                depending_on_id=self.node_id,
                description=description or f"Add row index column '{name}'",
            )

            # Add to graph
            self.flow_graph.add_record_id(record_id_settings)
        else:
            # Use the polars code approach for other cases
            code = f"input_df.with_row_index(name='{name}', offset={offset})"
            self._add_polars_code(
                new_node_id, code, description or f"Add row index column '{name}'"
            )

        return self._create_child_frame(new_node_id)

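    # --- Editor's usage sketch (illustrative, not part of the package source):
    #
    #     >>> frame.with_row_index()                            # polars-code path, 0-based "index"
    #     >>> frame.with_row_index(name="record_id", offset=1)  # native record-id node
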
    def explode(
        self,
        columns: str | Column | Iterable[str | Column],
        *more_columns: str | Column,
        description: str = None,
    ) -> "FlowFrame":
        """
        Explode the dataframe to long format by exploding the given columns.

        The underlying columns being exploded must be of the List or Array data type.

        Parameters
        ----------
        columns : str, Column, or Sequence[str | Column]
            Column names, expressions, or a sequence of them to explode
        *more_columns : str or Column
            Additional columns to explode, specified as positional arguments
        description : str, optional
            Description of this operation for the ETL graph

        Returns
        -------
        FlowFrame
            A new FlowFrame with exploded rows
        """
        new_node_id = generate_node_id()

        all_columns = []

        if isinstance(columns, (list, tuple)):
            all_columns.extend(
                [col.name if isinstance(col, Column) else col for col in columns]
            )
        else:
            all_columns.append(columns.name if isinstance(columns, Column) else columns)

        if more_columns:
            for col in more_columns:
                all_columns.append(col.name if isinstance(col, Column) else col)

        if len(all_columns) == 1:
            columns_str = f"'{all_columns[0]}'"
        else:
            columns_str = "[" + ", ".join([f"'{col}'" for col in all_columns]) + "]"

        code = f"""
# Explode columns into multiple rows
output_df = input_df.explode({columns_str})
"""

        cols_desc = ", ".join(all_columns)
        desc = description or f"Explode column(s): {cols_desc}"

        # Add polars code node
        self._add_polars_code(new_node_id, code, desc)

        return self._create_child_frame(new_node_id)

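    # --- Editor's usage sketch (illustrative, not part of the package source).
    # The exploded columns must hold List or Array values:
    #
    #     >>> frame.explode("tags")
    #     >>> frame.explode("tags", "scores")   # extra columns via *more_columns
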
    def text_to_rows(
        self,
        column: str | Column,
        output_column: str = None,
        delimiter: str = None,
        split_by_column: str = None,
        description: str = None,
    ) -> "FlowFrame":
        """
        Split text in a column into multiple rows.

        This is equivalent to the explode operation after string splitting in Polars.

        Parameters
        ----------
        column : str or Column
            Column containing text to split
        output_column : str, optional
            Column name for the split values (defaults to input column name)
        delimiter : str, optional
            Fixed string delimiter to split text on
        split_by_column : str, optional
            Alternative: column name containing the delimiter for each row.
            If provided, this overrides the delimiter parameter.
        description : str, optional
            Description of this operation for the ETL graph

        Returns
        -------
        FlowFrame
            A new FlowFrame with text split into multiple rows
        """
        new_node_id = generate_node_id()

        if isinstance(column, Column):
            column_name = column.name
        else:
            column_name = column

        output_column = output_column or column_name

        text_to_rows_input = transform_schema.TextToRowsInput(
            column_to_split=column_name,
            output_column_name=output_column,
            split_by_fixed_value=split_by_column is None,
            split_fixed_value=delimiter,
            split_by_column=split_by_column,
        )

        # Create node settings
        text_to_rows_settings = input_schema.NodeTextToRows(
            flow_id=self.flow_graph.flow_id,
            node_id=new_node_id,
            text_to_rows_input=text_to_rows_input,
            pos_x=200,
            pos_y=150,
            is_setup=True,
            depending_on_id=self.node_id,
            description=description or f"Split text in '{column_name}' to rows",
        )

        # Add to graph
        self.flow_graph.add_text_to_rows(text_to_rows_settings)

        return self._create_child_frame(new_node_id)

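    # --- Editor's usage sketch (illustrative, not part of the package source):
    #
    #     >>> frame.text_to_rows("emails", delimiter=";")
    #     >>> frame.text_to_rows("items", split_by_column="item_sep")  # per-row delimiter column
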
    def unique(
        self,
        subset: Union[str, "Expr", List[Union[str, "Expr"]]] = None,
        *,
        keep: Literal["first", "last", "any", "none"] = "any",
        maintain_order: bool = False,
        description: str = None,
    ) -> "FlowFrame":
        """
        Drop duplicate rows from this dataframe.

        Parameters
        ----------
        subset : str, Expr, list of str or Expr, optional
            Column name(s) or selector(s) to consider when identifying duplicate rows.
            If set to None (default), use all columns.
        keep : {'first', 'last', 'any', 'none'}, default 'any'
            Which of the duplicate rows to keep.
            * 'any': Does not give any guarantee of which row is kept.
              This allows more optimizations.
            * 'none': Don't keep duplicate rows.
            * 'first': Keep first unique row.
            * 'last': Keep last unique row.
        maintain_order : bool, default False
            Keep the same order as the original DataFrame. This is more expensive
            to compute. Setting this to True prevents running on the streaming
            engine.
        description : str, optional
            Description of this operation for the ETL graph.

        Returns
        -------
        FlowFrame
            DataFrame with unique rows.
        """
        new_node_id = generate_node_id()

        processed_subset = None
        can_use_native = True
        if subset is not None:
            # Convert to list if single item
            if not isinstance(subset, (list, tuple)):
                subset = [subset]

            # Extract column names
            processed_subset = []
            for col_expr in subset:
                if isinstance(col_expr, str):
                    processed_subset.append(col_expr)
                elif isinstance(col_expr, Column):
                    if col_expr._select_input.is_altered:
                        can_use_native = False
                        break
                    processed_subset.append(col_expr.name)
                else:
                    can_use_native = False
                    break

        # Determine if we can use the native implementation
        can_use_native = (
            can_use_native
            and keep in ["any", "first", "last", "none"]
            and not maintain_order
        )

        if can_use_native:
            # Use the native NodeUnique implementation
            unique_input = transform_schema.UniqueInput(
                columns=processed_subset, strategy=keep
            )

            # Create node settings
            unique_settings = input_schema.NodeUnique(
                flow_id=self.flow_graph.flow_id,
                node_id=new_node_id,
                unique_input=unique_input,
                pos_x=200,
                pos_y=150,
                is_setup=True,
                depending_on_id=self.node_id,
                description=description or f"Get unique rows (strategy: {keep})",
            )

            # Add to graph using native implementation
            self.flow_graph.add_unique(unique_settings)
        else:
            # Generate polars code for more complex cases
            if subset is None:
                subset_str = "None"
            elif isinstance(subset, (list, tuple)):
                # Format each item in the subset list
                items = []
                for item in subset:
                    if isinstance(item, str):
                        items.append(f'"{item}"')
                    else:
                        # For expressions, use their string representation
                        items.append(str(item))
                subset_str = f"[{', '.join(items)}]"
            else:
                # Single item that's not a string
                subset_str = str(subset)

            code = f"""
# Remove duplicate rows
output_df = input_df.unique(
    subset={subset_str},
    keep='{keep}',
    maintain_order={maintain_order}
)
"""
            # Create descriptive text based on parameters
            subset_desc = "all columns" if subset is None else f"columns: {subset_str}"
            desc = description or f"Get unique rows using {subset_desc}, keeping {keep}"

            # Add polars code node
            self._add_polars_code(new_node_id, code, desc)

        return self._create_child_frame(new_node_id)

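    # --- Editor's usage sketch (illustrative, not part of the package source).
    # String subsets with maintain_order=False stay on the native NodeUnique;
    # expression subsets or maintain_order=True use the generated-code fallback:
    #
    #     >>> frame.unique(subset=["customer_id"], keep="first")   # native
    #     >>> frame.unique(maintain_order=True)                    # polars fallback
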
    @property
    def columns(self) -> List[str]:
        """Get the column names."""
        return self.data.columns

    @property
    def dtypes(self) -> List[pl.DataType]:
        """Get the column data types."""
        return self.data.dtypes

    @property
    def schema(self) -> pl.schema.Schema:
        """Get an ordered mapping of column names to their data type."""
        return self.data.schema

    @property
    def width(self) -> int:
        """Get the number of columns."""
        return self.data.width


def _add_delegated_methods():
    """Add delegated methods from polars LazyFrame."""
    delegate_methods = [
        "collect_async",
        "profile",
        "describe",
        "explain",
        "show_graph",
        "serialize",
        "fetch",
        "get_meta",
        "columns",
        "dtypes",
        "schema",
        "estimated_size",
        "n_chunks",
        "is_empty",
        "chunk_lengths",
        "optimization_toggle",
        "set_polars_options",
        "collect_schema",
    ]

    already_implemented = set(dir(FlowFrame))

    for method_name in delegate_methods:
        if method_name not in already_implemented and hasattr(
            pl.LazyFrame, method_name
        ):
            # Create a simple delegate method
            def make_delegate(name):
                def delegate_method(self, *args, **kwargs):
                    return getattr(self.data, name)(*args, **kwargs)

                # Set docstring and name
                delegate_method.__doc__ = (
                    f"See pl.LazyFrame.{name} for full documentation."
                )
                delegate_method.__name__ = name
                return delegate_method

            # Add the method to the class
            setattr(FlowFrame, method_name, make_delegate(method_name))


_add_delegated_methods()

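# --- Editor's note (illustrative, not part of the package source).
# After _add_delegated_methods() runs, LazyFrame methods not defined on
# FlowFrame resolve against the wrapped frame in self.data, for example:
#
#     >>> frame.explain()          # delegates to pl.LazyFrame.explain
#     >>> frame.collect_schema()   # delegates to pl.LazyFrame.collect_schema
#
# Note these delegates bypass the ETL graph: they run on the current lazy
# frame and do not add a node.
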
def sum(expr):
    """Sum aggregation function."""
    if isinstance(expr, str):
        expr = col(expr)
    return expr.sum()


def mean(expr):
    """Mean aggregation function."""
    if isinstance(expr, str):
        expr = col(expr)
    return expr.mean()


def min(expr):
    """Min aggregation function."""
    if isinstance(expr, str):
        expr = col(expr)
    return expr.min()


def max(expr):
    """Max aggregation function."""
    if isinstance(expr, str):
        expr = col(expr)
    return expr.max()


def count(expr):
    """Count aggregation function."""
    if isinstance(expr, str):
        expr = col(expr)
    return expr.count()

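# --- Editor's usage sketch (illustrative, not part of the package source).
# Each helper accepts either a column name or an existing expression:
#
#     >>> sum("sales")          # equivalent to col("sales").sum()
#     >>> mean(col("price"))    # passes an existing expression through
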
def read_csv(file_path, *, flow_graph: FlowGraph = None, separator: str = ';',
             convert_to_absolute_path: bool = True,
             description: str = None, **options):
    """
    Read a CSV file into a FlowFrame.

    Args:
        file_path: Path to CSV file
        flow_graph: if you want to add it to an existing graph
        separator: Single byte character to use as separator in the file.
        convert_to_absolute_path: If the path needs to be set to a fixed location
        description: if you want to add a readable name in the frontend (advised)
        **options: Options for polars.read_csv

    Returns:
        A FlowFrame with the CSV data
    """
    # Create new node ID
    node_id = generate_node_id()
    if flow_graph is None:
        flow_graph = create_etl_graph()

    flow_id = flow_graph.flow_id

    has_headers = options.get('has_header', True)
    encoding = options.get('encoding', 'utf-8')

    if '~' in file_path:
        file_path = os.path.expanduser(file_path)

    received_table = input_schema.ReceivedTable(
        file_type='csv',
        path=file_path,
        name=Path(file_path).name,
        delimiter=separator,
        has_headers=has_headers,
        encoding=encoding
    )

    if convert_to_absolute_path:
        received_table.path = received_table.abs_file_path

    read_node = input_schema.NodeRead(
        flow_id=flow_id,
        node_id=node_id,
        received_file=received_table,
        pos_x=100,
        pos_y=100,
        is_setup=True,
        description=description  # was silently dropped; forwarded as in read_parquet
    )

    flow_graph.add_read(read_node)

    return FlowFrame(
        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
        flow_graph=flow_graph,
        node_id=node_id
    )

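# --- Editor's usage sketch (illustrative, not part of the package source).
# The top-level import path is assumed; only the function itself appears in
# this hunk:
#
#     >>> import flowfile as ff   # import path assumed
#     >>> frame = ff.read_csv("~/data/sales.csv", separator=",",
#     ...                     description="Load raw sales")
#
# Note that only `has_header` and `encoding` are read from **options above;
# other polars.read_csv options are accepted but not forwarded in this version.
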
def read_parquet(file_path, *, flow_graph: FlowGraph = None, description: str = None,
                 convert_to_absolute_path: bool = True, **options) -> FlowFrame:
    """
    Read a Parquet file into a FlowFrame.

    Args:
        file_path: Path to Parquet file
        flow_graph: if you want to add it to an existing graph
        description: if you want to add a readable name in the frontend (advised)
        convert_to_absolute_path: If the path needs to be set to a fixed location
        **options: Options for polars.read_parquet

    Returns:
        A FlowFrame with the Parquet data
    """
    if '~' in file_path:
        file_path = os.path.expanduser(file_path)
    node_id = generate_node_id()

    if flow_graph is None:
        flow_graph = create_etl_graph()

    flow_id = flow_graph.flow_id

    received_table = input_schema.ReceivedTable(
        file_type='parquet',
        path=file_path,
        name=Path(file_path).name,
    )
    if convert_to_absolute_path:
        received_table.path = received_table.abs_file_path

    read_node = input_schema.NodeRead(
        flow_id=flow_id,
        node_id=node_id,
        received_file=received_table,
        pos_x=100,
        pos_y=100,
        is_setup=True,
        description=description
    )

    flow_graph.add_read(read_node)

    return FlowFrame(
        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
        flow_graph=flow_graph,
        node_id=node_id
    )

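# --- Editor's usage sketch (illustrative, not part of the package source):
#
#     >>> frame = ff.read_parquet("~/data/sales.parquet", description="Load sales")
#
# As with read_csv above, **options is accepted but not forwarded to the
# reader in this version.
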
def from_dict(data, *, flow_graph: FlowGraph = None, description: str = None) -> FlowFrame:
    """
    Create a FlowFrame from a dictionary or list of dictionaries.

    Args:
        data: Dictionary of lists or list of dictionaries
        flow_graph: if you want to add it to an existing graph
        description: if you want to add a readable name in the frontend (advised)

    Returns:
        A FlowFrame with the data
    """
    # Create new node ID
    node_id = generate_node_id()

    if not flow_graph:
        flow_graph = create_etl_graph()
    flow_id = flow_graph.flow_id

    input_node = input_schema.NodeManualInput(
        flow_id=flow_id,
        node_id=node_id,
        raw_data=FlowDataEngine(data).to_pylist(),
        pos_x=100,
        pos_y=100,
        is_setup=True,
        description=description
    )

    # Add to graph
    flow_graph.add_manual_input(input_node)

    # Return new frame
    return FlowFrame(
        data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
        flow_graph=flow_graph,
        node_id=node_id
    )

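# --- Editor's usage sketch (illustrative, not part of the package source):
#
#     >>> frame = ff.from_dict({"region": ["EU", "US"], "sales": [10, 20]},
#     ...                      description="Manual input")
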
def concat(frames: List['FlowFrame'],
           how: str = 'vertical',
           rechunk: bool = False,
           parallel: bool = True,
           description: str = None) -> 'FlowFrame':
    """
    Concatenate multiple FlowFrames into one.

    Parameters
    ----------
    frames : List[FlowFrame]
        List of FlowFrames to concatenate
    how : str, default 'vertical'
        How to combine the FlowFrames (see concat method documentation)
    rechunk : bool, default False
        Whether to ensure contiguous memory in result
    parallel : bool, default True
        Whether to use parallel processing for the operation
    description : str, optional
        Description of this operation

    Returns
    -------
    FlowFrame
        A new FlowFrame with the concatenated data
    """
    if not frames:
        raise ValueError("No frames provided to concat")

    if len(frames) == 1:
        return frames[0]

    # Use first frame's concat method with remaining frames
    first_frame = frames[0]
    remaining_frames = frames[1:]

    return first_frame.concat(remaining_frames, how=how,
                              rechunk=rechunk, parallel=parallel,
                              description=description)
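# --- Editor's usage sketch (illustrative, not part of the package source).
# The module-level helper simply chains FlowFrame.concat from the first frame:
#
#     >>> combined = concat([frame_a, frame_b, frame_c], how="diagonal_relaxed")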