Flowfile 0.3.2__py3-none-any.whl → 0.3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +3 -2
- flowfile/web/__init__.py +3 -0
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/METADATA +4 -3
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/RECORD +46 -35
- flowfile_core/configs/__init__.py +15 -4
- flowfile_core/configs/settings.py +5 -3
- flowfile_core/configs/utils.py +18 -0
- flowfile_core/flowfile/FlowfileFlow.py +13 -18
- flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +54 -17
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +2 -1
- flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
- flowfile_core/flowfile/flow_node/flow_node.py +2 -1
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
- flowfile_core/flowfile/utils.py +34 -3
- flowfile_core/main.py +2 -3
- flowfile_core/routes/secrets.py +1 -1
- flowfile_core/schemas/input_schema.py +10 -4
- flowfile_core/schemas/transform_schema.py +25 -47
- flowfile_frame/__init__.py +11 -4
- flowfile_frame/adding_expr.py +280 -0
- flowfile_frame/config.py +9 -0
- flowfile_frame/expr.py +301 -83
- flowfile_frame/expr.pyi +2174 -0
- flowfile_frame/expr_name.py +258 -0
- flowfile_frame/flow_frame.py +584 -1002
- flowfile_frame/flow_frame.pyi +368 -0
- flowfile_frame/flow_frame_methods.py +617 -0
- flowfile_frame/group_frame.py +89 -42
- flowfile_frame/join.py +1 -2
- flowfile_frame/lazy.py +704 -0
- flowfile_frame/lazy_methods.py +201 -0
- flowfile_frame/list_name_space.py +324 -0
- flowfile_frame/selectors.py +3 -0
- flowfile_frame/series.py +70 -0
- flowfile_frame/utils.py +80 -4
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/LICENSE +0 -0
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/WHEEL +0 -0
- {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/entry_points.txt +0 -0
- /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
- /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
flowfile_frame/group_frame.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
|
-
|
|
2
|
-
from flowfile_frame.expr import col, Expr
|
|
1
|
+
from flowfile_frame.expr import col, Expr, Column, lit
|
|
3
2
|
from flowfile_frame.selectors import Selector
|
|
4
3
|
from flowfile_frame.utils import _parse_inputs_as_iterable
|
|
5
4
|
from flowfile_core.schemas import transform_schema, input_schema
|
|
6
|
-
from typing import TYPE_CHECKING
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
from flowfile_frame.utils import _check_if_convertible_to_code, ensure_inputs_as_iterable, get_pl_expr_from_expr
|
|
7
7
|
|
|
8
|
-
# Corrected TYPE_CHECKING block as provided by user
|
|
9
8
|
if TYPE_CHECKING:
|
|
10
9
|
from flowfile_frame.flow_frame import FlowFrame
|
|
11
10
|
else:
|
|
@@ -16,13 +15,34 @@ class GroupByFrame:
|
|
|
16
15
|
"""Represents a grouped DataFrame for aggregation operations."""
|
|
17
16
|
|
|
18
17
|
def __init__(self, node_id: int, parent_frame, by_cols, maintain_order=False, description: str = None):
|
|
18
|
+
"""Initialize a GroupByFrame instance.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
node_id: Unique identifier for this node
|
|
22
|
+
parent_frame: The parent FlowFrame this group operation is applied to
|
|
23
|
+
by_cols: Columns to group by
|
|
24
|
+
maintain_order: Whether to maintain original order
|
|
25
|
+
description: Optional description for this operation
|
|
26
|
+
"""
|
|
19
27
|
self.parent = parent_frame
|
|
20
28
|
self.by_cols = _parse_inputs_as_iterable(by_cols)
|
|
29
|
+
self.expr_by_cols = [self._create_expr_col(c) for c in self.by_cols]
|
|
21
30
|
self.maintain_order = maintain_order
|
|
22
31
|
self.description = description
|
|
23
32
|
self.node_id = node_id
|
|
24
33
|
|
|
34
|
+
@staticmethod
|
|
35
|
+
def _create_expr_col(col_: Any) -> Expr:
|
|
36
|
+
"""Convert various column specifications to Expr objects."""
|
|
37
|
+
if isinstance(col_, str):
|
|
38
|
+
return col(col_)
|
|
39
|
+
elif isinstance(col_, (Column, Expr)):
|
|
40
|
+
return col_
|
|
41
|
+
else:
|
|
42
|
+
return lit(col_)
|
|
43
|
+
|
|
25
44
|
def readable_group(self):
|
|
45
|
+
"""Generate a readable string representation of grouping columns."""
|
|
26
46
|
parts = []
|
|
27
47
|
for c in self.by_cols:
|
|
28
48
|
if isinstance(c, Expr):
|
|
@@ -34,24 +54,25 @@ class GroupByFrame:
|
|
|
34
54
|
return ", ".join(parts)
|
|
35
55
|
|
|
36
56
|
def len(self) -> 'FlowFrame':
|
|
37
|
-
"""
|
|
38
|
-
Count number of rows per group. Output column is named 'len'.
|
|
39
|
-
"""
|
|
40
|
-
# Uses direct code generation as per user's example
|
|
57
|
+
"""Count number of rows per group. Output column is named 'len'."""
|
|
41
58
|
return self._generate_direct_polars_code("len")
|
|
42
59
|
|
|
43
60
|
def count(self) -> 'FlowFrame':
|
|
44
|
-
"""
|
|
45
|
-
Count number of rows per group. Output column is named 'count'.
|
|
46
|
-
"""
|
|
47
|
-
# Uses direct code generation as per user's example
|
|
61
|
+
"""Count number of rows per group. Output column is named 'count'."""
|
|
48
62
|
return self._generate_direct_polars_code("count")
|
|
49
63
|
|
|
50
64
|
def agg(self, *agg_exprs, **named_agg_exprs) -> FlowFrame:
|
|
51
|
-
"""
|
|
52
|
-
|
|
65
|
+
"""Apply explicit aggregations to grouped data using expressions.
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
*agg_exprs: Aggregation expressions to apply
|
|
69
|
+
**named_agg_exprs: Named aggregation expressions
|
|
70
|
+
|
|
71
|
+
Returns:
|
|
72
|
+
FlowFrame: New frame with aggregated results
|
|
53
73
|
"""
|
|
54
74
|
agg_expressions = _parse_inputs_as_iterable(agg_exprs)
|
|
75
|
+
convertable_to_code = _check_if_convertible_to_code(agg_expressions)
|
|
55
76
|
can_be_converted: bool = not self.maintain_order
|
|
56
77
|
agg_cols: list[transform_schema.AggColl] = []
|
|
57
78
|
if can_be_converted:
|
|
@@ -61,34 +82,37 @@ class GroupByFrame:
|
|
|
61
82
|
if can_be_converted:
|
|
62
83
|
can_be_converted = self._process_named_agg_expressions(agg_cols, named_agg_exprs)
|
|
63
84
|
node_desc = self.description or f"Aggregate after grouping by {self.readable_group()}"
|
|
64
|
-
return self._create_agg_node(self.node_id, can_be_converted, agg_cols, agg_expressions, named_agg_exprs,
|
|
85
|
+
return self._create_agg_node(self.node_id, can_be_converted, agg_cols, agg_expressions, named_agg_exprs,
|
|
86
|
+
convertable_to_code=convertable_to_code, description=node_desc)
|
|
65
87
|
|
|
66
88
|
def _process_group_columns(self, agg_cols: list[transform_schema.AggColl]) -> bool:
|
|
67
|
-
|
|
89
|
+
"""Process grouping columns for aggregation schema."""
|
|
68
90
|
for col_expr in self.by_cols:
|
|
69
91
|
if isinstance(col_expr, str):
|
|
70
92
|
agg_cols.append(transform_schema.AggColl(old_name=col_expr, agg="groupby"))
|
|
71
93
|
elif isinstance(col_expr, Expr):
|
|
72
|
-
agg_cols.append(transform_schema.AggColl(old_name=col_expr.
|
|
94
|
+
agg_cols.append(transform_schema.AggColl(old_name=col_expr.column_name, agg="groupby"))
|
|
73
95
|
elif isinstance(col_expr, Selector):
|
|
74
96
|
return False
|
|
75
97
|
else:
|
|
76
|
-
|
|
98
|
+
return False
|
|
77
99
|
return True
|
|
78
100
|
|
|
79
101
|
@staticmethod
|
|
80
102
|
def _process_agg_expressions(agg_cols: list[transform_schema.AggColl], agg_expressions) -> bool:
|
|
81
|
-
|
|
103
|
+
"""Process aggregation expressions for schema conversion."""
|
|
82
104
|
for expr in agg_expressions:
|
|
83
105
|
if isinstance(expr, Expr):
|
|
106
|
+
if expr.is_complex:
|
|
107
|
+
return False
|
|
84
108
|
agg_func = getattr(expr, "agg_func", None)
|
|
85
|
-
old_name = getattr(expr, "_initial_column_name", expr.
|
|
109
|
+
old_name = getattr(expr, "_initial_column_name", expr.column_name) or expr.column_name
|
|
86
110
|
if agg_func:
|
|
87
111
|
agg_cols.append(
|
|
88
|
-
transform_schema.AggColl(old_name=old_name, agg=agg_func, new_name=expr.
|
|
112
|
+
transform_schema.AggColl(old_name=old_name, agg=agg_func, new_name=expr.column_name or old_name)
|
|
89
113
|
)
|
|
90
114
|
else:
|
|
91
|
-
agg_cols.append(transform_schema.AggColl(old_name=expr.
|
|
115
|
+
agg_cols.append(transform_schema.AggColl(old_name=expr.column_name, agg="first"))
|
|
92
116
|
elif isinstance(expr, str):
|
|
93
117
|
agg_cols.append(transform_schema.AggColl(old_name=expr, agg="first"))
|
|
94
118
|
elif isinstance(expr, Selector):
|
|
@@ -99,19 +123,20 @@ class GroupByFrame:
|
|
|
99
123
|
|
|
100
124
|
@staticmethod
|
|
101
125
|
def _process_named_agg_expressions(agg_cols: list[transform_schema.AggColl], named_agg_exprs: dict) -> bool:
|
|
126
|
+
"""Process named aggregation expressions for schema conversion."""
|
|
102
127
|
for name, expr in named_agg_exprs.items():
|
|
103
128
|
if expr.is_complex:
|
|
104
129
|
return False
|
|
105
130
|
if isinstance(expr, Expr):
|
|
106
131
|
agg_func = getattr(expr, "agg_func", "first")
|
|
107
|
-
old_name = getattr(expr, "_initial_column_name", expr.
|
|
132
|
+
old_name = getattr(expr, "_initial_column_name", expr.column_name) or expr.column_name
|
|
108
133
|
agg_cols.append(transform_schema.AggColl(old_name=old_name, agg=agg_func, new_name=name))
|
|
109
134
|
elif isinstance(expr, str):
|
|
110
135
|
agg_cols.append(transform_schema.AggColl(old_name=expr, agg="first", new_name=name))
|
|
111
136
|
elif isinstance(expr, tuple) and len(expr) == 2:
|
|
112
137
|
col_spec, agg_func_str = expr
|
|
113
138
|
if isinstance(col_spec, Expr):
|
|
114
|
-
old_name = getattr(col_spec, "_initial_column_name", col_spec.
|
|
139
|
+
old_name = getattr(col_spec, "_initial_column_name", col_spec.column_name) or col_spec.column_name
|
|
115
140
|
elif isinstance(col_spec, str):
|
|
116
141
|
old_name = col_spec
|
|
117
142
|
else:
|
|
@@ -123,9 +148,9 @@ class GroupByFrame:
|
|
|
123
148
|
return False
|
|
124
149
|
return True
|
|
125
150
|
|
|
126
|
-
def _create_agg_node(self, node_id_to_use: int, can_be_converted: bool, agg_cols: list, agg_expressions,
|
|
127
|
-
|
|
128
|
-
|
|
151
|
+
def _create_agg_node(self, node_id_to_use: int, can_be_converted: bool, agg_cols: list, agg_expressions,
|
|
152
|
+
named_agg_exprs, convertable_to_code: bool, description: str):
|
|
153
|
+
"""Create node for explicit aggregations via self.agg()."""
|
|
129
154
|
if can_be_converted:
|
|
130
155
|
group_by_settings = input_schema.NodeGroupBy(
|
|
131
156
|
flow_id=self.parent.flow_graph.flow_id,
|
|
@@ -138,34 +163,44 @@ class GroupByFrame:
|
|
|
138
163
|
self.parent.flow_graph.add_group_by(group_by_settings)
|
|
139
164
|
else:
|
|
140
165
|
code = self._generate_polars_agg_code(agg_expressions, named_agg_exprs)
|
|
141
|
-
|
|
166
|
+
pl_agg_expressions = list(map(get_pl_expr_from_expr, ensure_inputs_as_iterable(agg_expressions)))
|
|
167
|
+
pl_group_expr = list(map(get_pl_expr_from_expr, ensure_inputs_as_iterable(self.expr_by_cols)))
|
|
168
|
+
pl_kwargs_expr = {k: self._create_expr_col(c).expr for k, c in named_agg_exprs.items()}
|
|
169
|
+
self.parent._add_polars_code(new_node_id=node_id_to_use, code=code, description=description,
|
|
170
|
+
method_name='group_by', convertable_to_code=convertable_to_code,
|
|
171
|
+
polars_expr=pl_agg_expressions, group_expr=pl_group_expr,
|
|
172
|
+
kwargs_expr=pl_kwargs_expr,
|
|
173
|
+
group_kwargs={'maintain_order': self.maintain_order})
|
|
142
174
|
return self.parent._create_child_frame(node_id_to_use)
|
|
143
175
|
|
|
144
|
-
def _generate_direct_polars_code(self, method_name: str) -> "FlowFrame":
|
|
145
|
-
"""
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
176
|
+
def _generate_direct_polars_code(self, method_name: str, *args, **kwargs) -> "FlowFrame":
|
|
177
|
+
"""Generate Polars code for simple GroupBy methods like sum(), mean(), len(), count().
|
|
178
|
+
|
|
179
|
+
Args:
|
|
180
|
+
method_name: Name of the aggregation method
|
|
181
|
+
*args: Positional arguments for the method
|
|
182
|
+
**kwargs: Keyword arguments for the method
|
|
183
|
+
|
|
184
|
+
Returns:
|
|
185
|
+
FlowFrame: New child frame with the operation applied
|
|
149
186
|
"""
|
|
150
187
|
readable_group_str = self.readable_group()
|
|
151
|
-
|
|
188
|
+
execution = "(" + ",".join(args) + ",".join([f"{k}={v}" for k, v in kwargs.items()]) + ")"
|
|
189
|
+
|
|
190
|
+
code = f"input_df.group_by([{readable_group_str}], maintain_order={self.maintain_order}).{method_name}{execution}"
|
|
152
191
|
node_description = self.description or f"{method_name.capitalize()} after grouping by {readable_group_str}"
|
|
153
192
|
self.parent._add_polars_code(new_node_id=self.node_id, code=code, description=node_description)
|
|
154
193
|
return self.parent._create_child_frame(self.node_id)
|
|
155
194
|
|
|
156
195
|
def _generate_polars_agg_code(self, agg_expressions, named_agg_exprs) -> str:
|
|
157
196
|
"""Generate Polars code specifically for explicit .agg() calls."""
|
|
158
|
-
# (Implementation unchanged from user input)
|
|
159
197
|
readable_group_str = self.readable_group()
|
|
160
198
|
agg_strs = [str(expr) for expr in agg_expressions]
|
|
161
199
|
named_agg_strs = [f"{name}={str(expr)}" for name, expr in named_agg_exprs.items()]
|
|
162
200
|
all_agg_strs = agg_strs + named_agg_strs
|
|
163
201
|
agg_combined = ", ".join(all_agg_strs)
|
|
164
|
-
# Assuming input dataframe is 'input_df' in execution context
|
|
165
202
|
return f"input_df.group_by([{readable_group_str}], maintain_order={self.maintain_order}).agg({agg_combined})"
|
|
166
203
|
|
|
167
|
-
# --- Convenience Methods (No Column Args - Use Direct Code Gen) ---
|
|
168
|
-
|
|
169
204
|
def sum(self):
|
|
170
205
|
"""Calculate sum for all non-grouping columns."""
|
|
171
206
|
return self._generate_direct_polars_code("sum")
|
|
@@ -180,20 +215,32 @@ class GroupByFrame:
|
|
|
180
215
|
|
|
181
216
|
def min(self):
|
|
182
217
|
"""Calculate minimum for all non-grouping columns."""
|
|
183
|
-
# Remove *columns argument
|
|
184
218
|
return self._generate_direct_polars_code("min")
|
|
185
219
|
|
|
186
220
|
def max(self):
|
|
187
221
|
"""Calculate maximum for all non-grouping columns."""
|
|
188
|
-
# Remove *columns argument
|
|
189
222
|
return self._generate_direct_polars_code("max")
|
|
190
223
|
|
|
224
|
+
def tail(self, n: int = 10):
|
|
225
|
+
"""Get last n rows for all non-grouping columns.
|
|
226
|
+
|
|
227
|
+
Args:
|
|
228
|
+
n: Number of rows to return (default: 10)
|
|
229
|
+
"""
|
|
230
|
+
return self._generate_direct_polars_code("tail", n=n)
|
|
231
|
+
|
|
232
|
+
def head(self, n: int = 10):
|
|
233
|
+
"""Get the first n rows for all non-grouping columns.
|
|
234
|
+
|
|
235
|
+
Args:
|
|
236
|
+
n: Number of rows to return (default: 10)
|
|
237
|
+
"""
|
|
238
|
+
return self._generate_direct_polars_code("head", n=n)
|
|
239
|
+
|
|
191
240
|
def first(self):
|
|
192
241
|
"""Get first value for all non-grouping columns."""
|
|
193
|
-
# Remove *columns argument
|
|
194
242
|
return self._generate_direct_polars_code("first")
|
|
195
243
|
|
|
196
244
|
def last(self):
|
|
197
245
|
"""Get last value for all non-grouping columns."""
|
|
198
|
-
# Remove *columns argument
|
|
199
246
|
return self._generate_direct_polars_code("last")
|
flowfile_frame/join.py
CHANGED
|
@@ -39,11 +39,10 @@ def _extract_column_name(col_expr):
|
|
|
39
39
|
if isinstance(col_expr, Column):
|
|
40
40
|
# If it's a simple unaltered column, use its name
|
|
41
41
|
if not col_expr._select_input.is_altered:
|
|
42
|
-
return col_expr.
|
|
42
|
+
return col_expr.column_name, False
|
|
43
43
|
# Otherwise, this requires polars code
|
|
44
44
|
return col_expr, True
|
|
45
45
|
|
|
46
|
-
# Any other expression type needs polars code
|
|
47
46
|
return col_expr, True
|
|
48
47
|
|
|
49
48
|
|