Flowfile 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic; see the registry's advisory page for more details.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/{CloudConnectionManager-d004942f.js → CloudConnectionManager-c20a740f.js} +3 -4
- flowfile/web/static/assets/{CloudStorageReader-eccf9fc2.js → CloudStorageReader-960b400a.js} +7 -7
- flowfile/web/static/assets/{CloudStorageWriter-b1ba6bba.js → CloudStorageWriter-e3decbdd.js} +7 -7
- flowfile/web/static/assets/{CrossJoin-68981877.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-0b06649c.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-8349a426.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-905344f8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-9f5b8638.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-131a6d53.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-e3549dcc.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-6e0730ae.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-02f033e6.js → Formula-4207ea31.js} +8 -8
- flowfile/web/static/assets/{FuzzyMatch-54c14036.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-08a3f499.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-2ae38139.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-493b9772.js → Join-4e49a274.js} +9 -9
- flowfile/web/static/assets/{ManualInput-4373d163.js → ManualInput-90998ae8.js} +5 -5
- flowfile/web/static/assets/{Output-b534f3c7.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-2968ff65.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-65136536.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-c56339ed.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-1c641a5e.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-df308b8f.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-293e8a64.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-03911655.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-3058a13d.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-fbf4fb39.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-a29bbaf7.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-c7d7760e.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-118f1d20.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-f0589571.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-7329a207.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-30b0be15.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/{api-fb67319c.js → api-6ef0dcef.js} +1 -1
- flowfile/web/static/assets/{api-602fb95c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/{designer-94a6bf4d.js → designer-13eabd83.js} +4 -4
- flowfile/web/static/assets/{documentation-a224831e.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-c2d2aa97.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-921ac5fd.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-7013cc94.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-3a75211d.js → index-f6c15e76.js} +46 -22
- flowfile/web/static/assets/{nodeTitle-a63d4680.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-763aec6e.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-08464729.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-f15a5f87.js → vue-codemirror.esm-2847001e.js} +1 -1
- flowfile/web/static/assets/{vue-content-loader.es-93bd09d7.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/METADATA +2 -2
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/RECORD +96 -94
- flowfile_core/__init__.py +1 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +1 -0
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/flowfile/code_generator/code_generator.py +71 -0
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +597 -309
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_graph.py +619 -191
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +500 -89
- flowfile_core/flowfile/flow_node/models.py +125 -20
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +36 -5
- flowfile_core/main.py +32 -13
- flowfile_core/routes/cloud_connections.py +7 -11
- flowfile_core/routes/logs.py +2 -6
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +127 -51
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/input_schema.py +92 -64
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +144 -11
- flowfile_core/schemas/transform_schema.py +82 -17
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/__init__.py +0 -0
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +232 -110
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +150 -12
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- test_utils/s3/data_generator.py +1 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +6 -1
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/models.py +0 -193
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile-0.3.6.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +0 -0
flowfile_frame/flow_frame.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Any, Iterable, List, Literal, Optional, Tuple, Union, Dict, C
|
|
|
5
5
|
import re
|
|
6
6
|
|
|
7
7
|
import polars as pl
|
|
8
|
-
|
|
8
|
+
from polars._typing import (CsvEncoding)
|
|
9
9
|
from flowfile_frame.lazy_methods import add_lazyframe_methods
|
|
10
10
|
|
|
11
11
|
from polars._typing import (FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
|
|
@@ -20,13 +20,12 @@ from flowfile_frame.expr import Expr, Column, lit, col
|
|
|
20
20
|
from flowfile_frame.selectors import Selector
|
|
21
21
|
from flowfile_frame.group_frame import GroupByFrame
|
|
22
22
|
from flowfile_frame.utils import (_parse_inputs_as_iterable, create_flow_graph, stringify_values,
|
|
23
|
-
ensure_inputs_as_iterable
|
|
23
|
+
ensure_inputs_as_iterable, generate_node_id,
|
|
24
|
+
set_node_id, data as node_id_data)
|
|
24
25
|
from flowfile_frame.join import _normalize_columns_to_list, _create_join_mappings
|
|
25
26
|
from flowfile_frame.utils import _check_if_convertible_to_code
|
|
26
27
|
from flowfile_frame.config import logger
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
node_id_counter = 0
|
|
28
|
+
from flowfile_frame.cloud_storage.frame_helpers import add_write_ff_to_cloud_storage
|
|
30
29
|
|
|
31
30
|
|
|
32
31
|
def can_be_expr(param: inspect.Parameter) -> bool:
|
|
@@ -115,12 +114,6 @@ def _check_ok_for_serialization(method_name: str = None, polars_expr: pl.Expr |
|
|
|
115
114
|
raise NotImplementedError("Cannot create a polars lambda expression without the groupby expression")
|
|
116
115
|
|
|
117
116
|
|
|
118
|
-
def generate_node_id() -> int:
|
|
119
|
-
global node_id_counter
|
|
120
|
-
node_id_counter += 1
|
|
121
|
-
return node_id_counter
|
|
122
|
-
|
|
123
|
-
|
|
124
117
|
@add_lazyframe_methods
|
|
125
118
|
class FlowFrame:
|
|
126
119
|
"""Main class that wraps FlowDataEngine and maintains the ETL graph."""
|
|
@@ -181,38 +174,41 @@ class FlowFrame:
|
|
|
181
174
|
flow_graph = create_flow_graph()
|
|
182
175
|
|
|
183
176
|
flow_id = flow_graph.flow_id
|
|
184
|
-
# Convert data to a polars DataFrame/
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
177
|
+
# Convert data to a polars DataFrame/LazyFram
|
|
178
|
+
if isinstance(data, pl.LazyFrame):
|
|
179
|
+
flow_graph.add_dependency_on_polars_lazy_frame(data.lazy(), node_id)
|
|
180
|
+
else:
|
|
181
|
+
try:
|
|
182
|
+
# Use polars to convert from various types
|
|
183
|
+
pl_df = pl.DataFrame(
|
|
184
|
+
data,
|
|
185
|
+
schema=schema,
|
|
186
|
+
schema_overrides=schema_overrides,
|
|
187
|
+
strict=strict,
|
|
188
|
+
orient=orient,
|
|
189
|
+
infer_schema_length=infer_schema_length,
|
|
190
|
+
nan_to_null=nan_to_null,
|
|
191
|
+
)
|
|
192
|
+
pl_data = pl_df.lazy()
|
|
193
|
+
except Exception as e:
|
|
194
|
+
raise ValueError(f"Could not dconvert data to a polars DataFrame: {e}")
|
|
195
|
+
# Create a FlowDataEngine to get data in the right format for manual input
|
|
196
|
+
flow_table = FlowDataEngine(raw_data=pl_data)
|
|
197
|
+
raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
|
|
198
|
+
columns=[c.get_minimal_field_info() for c in flow_table.schema])
|
|
199
|
+
# Create a manual input node
|
|
200
|
+
input_node = input_schema.NodeManualInput(
|
|
201
|
+
flow_id=flow_id,
|
|
202
|
+
node_id=node_id,
|
|
203
|
+
raw_data_format=raw_data_format,
|
|
204
|
+
pos_x=100,
|
|
205
|
+
pos_y=100,
|
|
206
|
+
is_setup=True,
|
|
207
|
+
description=description,
|
|
195
208
|
)
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
# Create a FlowDataEngine to get data in the right format for manual input
|
|
200
|
-
flow_table = FlowDataEngine(raw_data=pl_data)
|
|
201
|
-
raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
|
|
202
|
-
columns=[c.get_minimal_field_info() for c in flow_table.schema])
|
|
203
|
-
# Create a manual input node
|
|
204
|
-
input_node = input_schema.NodeManualInput(
|
|
205
|
-
flow_id=flow_id,
|
|
206
|
-
node_id=node_id,
|
|
207
|
-
raw_data_format=raw_data_format,
|
|
208
|
-
pos_x=100,
|
|
209
|
-
pos_y=100,
|
|
210
|
-
is_setup=True,
|
|
211
|
-
description=description,
|
|
212
|
-
)
|
|
213
|
-
# Add to graph
|
|
214
|
-
flow_graph.add_manual_input(input_node)
|
|
215
|
-
# Return new frame
|
|
209
|
+
# Add to graph
|
|
210
|
+
flow_graph.add_manual_input(input_node)
|
|
211
|
+
# Return new fram
|
|
216
212
|
return FlowFrame(
|
|
217
213
|
data=flow_graph.get_node(node_id).get_resulting_data().data_frame,
|
|
218
214
|
flow_graph=flow_graph,
|
|
@@ -221,69 +217,92 @@ class FlowFrame:
|
|
|
221
217
|
)
|
|
222
218
|
|
|
223
219
|
def __new__(
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
return cls.create_from_any_type(
|
|
241
|
-
data=data,
|
|
242
|
-
schema=schema,
|
|
243
|
-
schema_overrides=schema_overrides,
|
|
244
|
-
strict=strict,
|
|
245
|
-
orient=orient,
|
|
246
|
-
infer_schema_length=infer_schema_length,
|
|
247
|
-
nan_to_null=nan_to_null,
|
|
248
|
-
flow_graph=flow_graph,
|
|
249
|
-
node_id=node_id,
|
|
250
|
-
parent_node_id=parent_node_id,
|
|
251
|
-
)
|
|
220
|
+
cls,
|
|
221
|
+
data: pl.LazyFrame | FrameInitTypes = None,
|
|
222
|
+
schema: SchemaDefinition | None = None,
|
|
223
|
+
*,
|
|
224
|
+
schema_overrides: SchemaDict | None = None,
|
|
225
|
+
strict: bool = True,
|
|
226
|
+
orient: Orientation | None = None,
|
|
227
|
+
infer_schema_length: int | None = 100,
|
|
228
|
+
nan_to_null: bool = False,
|
|
229
|
+
flow_graph: Optional[FlowGraph] = None,
|
|
230
|
+
node_id: Optional[int] = None,
|
|
231
|
+
parent_node_id: Optional[int] = None,
|
|
232
|
+
**kwargs, # Accept and ignore any other kwargs for API compatibility
|
|
233
|
+
) -> "FlowFrame":
|
|
234
|
+
"""
|
|
235
|
+
Unified constructor for FlowFrame.
|
|
252
236
|
|
|
253
|
-
|
|
254
|
-
|
|
237
|
+
- If `flow_graph` and `node_id` are provided, it creates a lightweight Python
|
|
238
|
+
wrapper around an existing node in the graph.
|
|
239
|
+
- Otherwise, it creates a new source node in a new or existing graph
|
|
240
|
+
from the provided data.
|
|
241
|
+
"""
|
|
242
|
+
# --- Path 1: Internal Wrapper Creation ---
|
|
243
|
+
# This path is taken by methods like .join(), .sort(), etc., which provide an existing graph.
|
|
244
|
+
if flow_graph is not None and node_id is not None:
|
|
245
|
+
instance = super().__new__(cls)
|
|
246
|
+
instance.data = data
|
|
247
|
+
instance.flow_graph = flow_graph
|
|
248
|
+
instance.node_id = node_id
|
|
249
|
+
instance.parent_node_id = parent_node_id
|
|
250
|
+
return instance
|
|
251
|
+
elif flow_graph is not None and not isinstance(data, pl.LazyFrame):
|
|
252
|
+
instance = cls.create_from_any_type(data=data, schema=schema, schema_overrides=schema_overrides,
|
|
253
|
+
strict=strict, orient=orient, infer_schema_length=infer_schema_length,
|
|
254
|
+
nan_to_null=nan_to_null, flow_graph=flow_graph, node_id=node_id,
|
|
255
|
+
parent_node_id=parent_node_id
|
|
256
|
+
)
|
|
257
|
+
return instance
|
|
258
|
+
|
|
259
|
+
source_graph = create_flow_graph()
|
|
260
|
+
source_node_id = generate_node_id()
|
|
255
261
|
|
|
256
|
-
def __init__(
|
|
257
|
-
self,
|
|
258
|
-
data: pl.LazyFrame | FrameInitTypes = None,
|
|
259
|
-
schema: SchemaDefinition | None = None,
|
|
260
|
-
*,
|
|
261
|
-
schema_overrides: SchemaDict | None = None,
|
|
262
|
-
strict: bool = True,
|
|
263
|
-
orient: Orientation | None = None,
|
|
264
|
-
infer_schema_length: int | None = 100,
|
|
265
|
-
nan_to_null: bool = False,
|
|
266
|
-
flow_graph=None,
|
|
267
|
-
node_id=None,
|
|
268
|
-
parent_node_id=None,
|
|
269
|
-
):
|
|
270
|
-
"""Initialize the FlowFrame with data and graph references."""
|
|
271
262
|
if data is None:
|
|
272
263
|
data = pl.LazyFrame()
|
|
273
264
|
if not isinstance(data, pl.LazyFrame):
|
|
274
|
-
return
|
|
275
|
-
self.node_id = node_id or generate_node_id()
|
|
276
|
-
self.parent_node_id = parent_node_id
|
|
277
265
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
266
|
+
description = "Data imported from Python object"
|
|
267
|
+
try:
|
|
268
|
+
pl_df = pl.DataFrame(
|
|
269
|
+
data, schema=schema, schema_overrides=schema_overrides,
|
|
270
|
+
strict=strict, orient=orient, infer_schema_length=infer_schema_length,
|
|
271
|
+
nan_to_null=nan_to_null
|
|
272
|
+
)
|
|
273
|
+
pl_data = pl_df.lazy()
|
|
274
|
+
except Exception as e:
|
|
275
|
+
raise ValueError(f"Could not convert data to a Polars DataFrame: {e}")
|
|
276
|
+
|
|
277
|
+
flow_table = FlowDataEngine(raw_data=pl_data)
|
|
278
|
+
raw_data_format = input_schema.RawData(data=list(flow_table.to_dict().values()),
|
|
279
|
+
columns=[c.get_minimal_field_info() for c in flow_table.schema])
|
|
280
|
+
input_node = input_schema.NodeManualInput(
|
|
281
|
+
flow_id=source_graph.flow_id, node_id=source_node_id,
|
|
282
|
+
raw_data_format=raw_data_format, pos_x=100, pos_y=100,
|
|
283
|
+
is_setup=True, description=description
|
|
284
|
+
)
|
|
285
|
+
source_graph.add_manual_input(input_node)
|
|
285
286
|
else:
|
|
286
|
-
|
|
287
|
+
source_graph.add_dependency_on_polars_lazy_frame(data, source_node_id)
|
|
288
|
+
|
|
289
|
+
final_data = source_graph.get_node(source_node_id).get_resulting_data().data_frame
|
|
290
|
+
return cls(
|
|
291
|
+
data=final_data,
|
|
292
|
+
flow_graph=source_graph,
|
|
293
|
+
node_id=source_node_id,
|
|
294
|
+
parent_node_id=parent_node_id
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
def __init__(self, *args, **kwargs):
|
|
298
|
+
"""
|
|
299
|
+
The __init__ method is intentionally left empty.
|
|
300
|
+
All initialization logic is handled in the `__new__` method to support
|
|
301
|
+
the flexible factory pattern and prevent state from being overwritten.
|
|
302
|
+
Python automatically calls __init__ after __new__, so this empty
|
|
303
|
+
method catches that call and safely does nothing.
|
|
304
|
+
"""
|
|
305
|
+
pass
|
|
287
306
|
|
|
288
307
|
def __repr__(self):
|
|
289
308
|
return str(self.data)
|
|
@@ -594,7 +613,6 @@ class FlowFrame:
|
|
|
594
613
|
use_polars_code = self._should_use_polars_code_for_join(
|
|
595
614
|
maintain_order, coalesce, nulls_equal, validate, suffix
|
|
596
615
|
)
|
|
597
|
-
|
|
598
616
|
# Step 2: Ensure both FlowFrames are in the same graph
|
|
599
617
|
self._ensure_same_graph(other)
|
|
600
618
|
|
|
@@ -662,9 +680,7 @@ class FlowFrame:
|
|
|
662
680
|
other.node_id = new_other_node_id
|
|
663
681
|
self.flow_graph = combined_graph
|
|
664
682
|
other.flow_graph = combined_graph
|
|
665
|
-
|
|
666
|
-
global node_id_counter
|
|
667
|
-
node_id_counter += len(combined_graph.nodes)
|
|
683
|
+
node_id_data["c"] = node_id_data["c"] + len(combined_graph.nodes)
|
|
668
684
|
|
|
669
685
|
def _parse_join_columns(
|
|
670
686
|
self,
|
|
@@ -781,7 +797,6 @@ class FlowFrame:
|
|
|
781
797
|
# Create select inputs for both frames
|
|
782
798
|
left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
|
|
783
799
|
right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)
|
|
784
|
-
|
|
785
800
|
# Create appropriate join input based on join type
|
|
786
801
|
if how == 'cross':
|
|
787
802
|
join_input = transform_schema.CrossJoinInput(
|
|
@@ -811,7 +826,6 @@ class FlowFrame:
|
|
|
811
826
|
# Add connections
|
|
812
827
|
self._add_connection(self.node_id, new_node_id, "main")
|
|
813
828
|
other._add_connection(other.node_id, new_node_id, "right")
|
|
814
|
-
|
|
815
829
|
# Create and return result frame
|
|
816
830
|
return FlowFrame(
|
|
817
831
|
data=self.flow_graph.get_node(new_node_id).get_resulting_data().data_frame,
|
|
@@ -1074,7 +1088,7 @@ class FlowFrame:
|
|
|
1074
1088
|
|
|
1075
1089
|
def write_parquet(
|
|
1076
1090
|
self,
|
|
1077
|
-
path: str|os.PathLike,
|
|
1091
|
+
path: str | os.PathLike,
|
|
1078
1092
|
*,
|
|
1079
1093
|
description: str = None,
|
|
1080
1094
|
convert_to_absolute_path: bool = True,
|
|
@@ -1244,6 +1258,117 @@ class FlowFrame:
|
|
|
1244
1258
|
|
|
1245
1259
|
return self._create_child_frame(new_node_id)
|
|
1246
1260
|
|
|
1261
|
+
def write_parquet_to_cloud_storage(self,
|
|
1262
|
+
path: str,
|
|
1263
|
+
connection_name: Optional[str] = None,
|
|
1264
|
+
compression: Literal["snappy", "gzip", "brotli", "lz4", "zstd"] = "snappy",
|
|
1265
|
+
description: Optional[str] = None,
|
|
1266
|
+
) -> "FlowFrame":
|
|
1267
|
+
"""
|
|
1268
|
+
Write the data frame to cloud storage in Parquet format.
|
|
1269
|
+
|
|
1270
|
+
Args:
|
|
1271
|
+
path (str): The destination path in cloud storage where the Parquet file will be written.
|
|
1272
|
+
connection_name (Optional[str], optional): The name of the storage connection
|
|
1273
|
+
that a user can create. If None, uses the default connection. Defaults to None.
|
|
1274
|
+
compression (Literal["snappy", "gzip", "brotli", "lz4", "zstd"], optional):
|
|
1275
|
+
The compression algorithm to use for the Parquet file. Defaults to "snappy".
|
|
1276
|
+
description (Optional[str], optional): Description of this operation for the ETL graph.
|
|
1277
|
+
|
|
1278
|
+
Returns:
|
|
1279
|
+
FlowFrame: A new child data frame representing the written data.
|
|
1280
|
+
"""
|
|
1281
|
+
|
|
1282
|
+
new_node_id = add_write_ff_to_cloud_storage(path, flow_graph=self.flow_graph,
|
|
1283
|
+
connection_name=connection_name,
|
|
1284
|
+
depends_on_node_id=self.node_id,
|
|
1285
|
+
parquet_compression=compression,
|
|
1286
|
+
file_format="parquet",
|
|
1287
|
+
description=description)
|
|
1288
|
+
return self._create_child_frame(new_node_id)
|
|
1289
|
+
|
|
1290
|
+
def write_csv_to_cloud_storage(self,
|
|
1291
|
+
path: str,
|
|
1292
|
+
connection_name: Optional[str] = None,
|
|
1293
|
+
delimiter: str = ";",
|
|
1294
|
+
encoding: CsvEncoding = "utf8",
|
|
1295
|
+
description: Optional[str] = None,
|
|
1296
|
+
) -> "FlowFrame":
|
|
1297
|
+
"""
|
|
1298
|
+
Write the data frame to cloud storage in CSV format.
|
|
1299
|
+
|
|
1300
|
+
Args:
|
|
1301
|
+
path (str): The destination path in cloud storage where the CSV file will be written.
|
|
1302
|
+
connection_name (Optional[str], optional): The name of the storage connection
|
|
1303
|
+
that a user can create. If None, uses the default connection. Defaults to None.
|
|
1304
|
+
delimiter (str, optional): The character used to separate fields in the CSV file.
|
|
1305
|
+
Defaults to ";".
|
|
1306
|
+
encoding (CsvEncoding, optional): The character encoding to use for the CSV file.
|
|
1307
|
+
Defaults to "utf8".
|
|
1308
|
+
description (Optional[str], optional): Description of this operation for the ETL graph.
|
|
1309
|
+
|
|
1310
|
+
Returns:
|
|
1311
|
+
FlowFrame: A new child data frame representing the written data.
|
|
1312
|
+
"""
|
|
1313
|
+
new_node_id = add_write_ff_to_cloud_storage(path, flow_graph=self.flow_graph,
|
|
1314
|
+
connection_name=connection_name,
|
|
1315
|
+
depends_on_node_id=self.node_id,
|
|
1316
|
+
csv_delimiter=delimiter,
|
|
1317
|
+
csv_encoding=encoding,
|
|
1318
|
+
file_format="csv",
|
|
1319
|
+
description=description)
|
|
1320
|
+
return self._create_child_frame(new_node_id)
|
|
1321
|
+
|
|
1322
|
+
def write_delta(self,
|
|
1323
|
+
path: str,
|
|
1324
|
+
connection_name: Optional[str] = None,
|
|
1325
|
+
write_mode: Literal["overwrite", "append"] = "overwrite",
|
|
1326
|
+
description: Optional[str] = None,
|
|
1327
|
+
) -> "FlowFrame":
|
|
1328
|
+
"""
|
|
1329
|
+
Write the data frame to cloud storage in Delta Lake format.
|
|
1330
|
+
|
|
1331
|
+
Args:
|
|
1332
|
+
path (str): The destination path in cloud storage where the Delta table will be written.
|
|
1333
|
+
connection_name (Optional[str], optional): The name of the storage connection
|
|
1334
|
+
that a user can create. If None, uses the default connection. Defaults to None.
|
|
1335
|
+
write_mode (Literal["overwrite", "append"], optional): The write mode for the Delta table.
|
|
1336
|
+
"overwrite" replaces existing data, "append" adds to existing data. Defaults to "overwrite".
|
|
1337
|
+
description (Optional[str], optional): Description of this operation for the ETL graph.
|
|
1338
|
+
Returns:
|
|
1339
|
+
FlowFrame: A new child data frame representing the written data.
|
|
1340
|
+
"""
|
|
1341
|
+
new_node_id = add_write_ff_to_cloud_storage(path, flow_graph=self.flow_graph,
|
|
1342
|
+
connection_name=connection_name,
|
|
1343
|
+
depends_on_node_id=self.node_id,
|
|
1344
|
+
write_mode=write_mode,
|
|
1345
|
+
file_format="delta",
|
|
1346
|
+
description=description)
|
|
1347
|
+
return self._create_child_frame(new_node_id)
|
|
1348
|
+
|
|
1349
|
+
def write_json_to_cloud_storage(self,
|
|
1350
|
+
path: str,
|
|
1351
|
+
connection_name: Optional[str] = None,
|
|
1352
|
+
description: Optional[str] = None,
|
|
1353
|
+
) -> "FlowFrame":
|
|
1354
|
+
"""
|
|
1355
|
+
Write the data frame to cloud storage in JSON format.
|
|
1356
|
+
|
|
1357
|
+
Args:
|
|
1358
|
+
path (str): The destination path in cloud storage where the JSON file will be written.
|
|
1359
|
+
connection_name (Optional[str], optional): The name of the storage connection
|
|
1360
|
+
that a user can create. If None, uses the default connection. Defaults to None.
|
|
1361
|
+
description (Optional[str], optional): Description of this operation for the ETL graph.
|
|
1362
|
+
Returns:
|
|
1363
|
+
FlowFrame: A new child data frame representing the written data.
|
|
1364
|
+
"""
|
|
1365
|
+
new_node_id = add_write_ff_to_cloud_storage(path, flow_graph=self.flow_graph,
|
|
1366
|
+
connection_name=connection_name,
|
|
1367
|
+
depends_on_node_id=self.node_id,
|
|
1368
|
+
file_format="json",
|
|
1369
|
+
description=description)
|
|
1370
|
+
return self._create_child_frame(new_node_id)
|
|
1371
|
+
|
|
1247
1372
|
def group_by(self, *by, description: str = None, maintain_order=False, **named_by) -> GroupByFrame:
|
|
1248
1373
|
"""
|
|
1249
1374
|
Start a group by operation.
|
|
@@ -1275,7 +1400,6 @@ class FlowFrame:
|
|
|
1275
1400
|
by_cols.append(col(col_expr).alias(new_name))
|
|
1276
1401
|
elif isinstance(col_expr, Expr):
|
|
1277
1402
|
by_cols.append(col_expr.alias(new_name))
|
|
1278
|
-
|
|
1279
1403
|
# Create a GroupByFrame
|
|
1280
1404
|
return GroupByFrame(
|
|
1281
1405
|
node_id=new_node_id,
|
|
@@ -1292,7 +1416,7 @@ class FlowFrame:
|
|
|
1292
1416
|
self.flow_graph.apply_layout()
|
|
1293
1417
|
self.flow_graph.save_flow(file_path)
|
|
1294
1418
|
|
|
1295
|
-
def collect(self, *args, **kwargs):
|
|
1419
|
+
def collect(self, *args, **kwargs) -> pl.DataFrame:
|
|
1296
1420
|
"""Collect lazy data into memory."""
|
|
1297
1421
|
if hasattr(self.data, "collect"):
|
|
1298
1422
|
return self.data.collect(*args, **kwargs)
|
|
@@ -1614,8 +1738,7 @@ class FlowFrame:
|
|
|
1614
1738
|
combined_graph, node_mappings = combine_flow_graphs_with_mapping(*all_graphs)
|
|
1615
1739
|
for f in [self] + other:
|
|
1616
1740
|
f.node_id = node_mappings.get((f.flow_graph.flow_id, f.node_id), None)
|
|
1617
|
-
|
|
1618
|
-
node_id_counter += len(combined_graph.nodes)
|
|
1741
|
+
node_id_data["c"] = node_id_data["c"] + len(combined_graph.nodes)
|
|
1619
1742
|
else:
|
|
1620
1743
|
combined_graph = self.flow_graph
|
|
1621
1744
|
new_node_id = generate_node_id()
|
|
@@ -1810,7 +1933,6 @@ class FlowFrame:
|
|
|
1810
1933
|
all_input_expr_objects: List[Expr] = []
|
|
1811
1934
|
pure_polars_expr_strings_for_wc: List[str] = []
|
|
1812
1935
|
collected_raw_definitions: List[str] = []
|
|
1813
|
-
|
|
1814
1936
|
has_exprs_or_named_exprs = bool(exprs or named_exprs)
|
|
1815
1937
|
if has_exprs_or_named_exprs:
|
|
1816
1938
|
actual_exprs_to_process: List[Expr] = []
|