Flowfile 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of Flowfile has been flagged as potentially problematic by the registry's automated analysis; consult the registry's advisory page for this release for further details.

Files changed (46):
  1. flowfile/__init__.py +2 -1
  2. flowfile/web/__init__.py +3 -0
  3. {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/METADATA +1 -1
  4. {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/RECORD +46 -35
  5. flowfile_core/configs/__init__.py +15 -4
  6. flowfile_core/configs/settings.py +5 -3
  7. flowfile_core/configs/utils.py +18 -0
  8. flowfile_core/flowfile/FlowfileFlow.py +13 -18
  9. flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
  10. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +54 -17
  11. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
  12. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
  13. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +2 -1
  14. flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
  15. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
  16. flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
  17. flowfile_core/flowfile/flow_node/flow_node.py +2 -1
  18. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
  19. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
  20. flowfile_core/flowfile/utils.py +34 -3
  21. flowfile_core/main.py +2 -3
  22. flowfile_core/routes/secrets.py +1 -1
  23. flowfile_core/schemas/input_schema.py +10 -4
  24. flowfile_core/schemas/transform_schema.py +25 -47
  25. flowfile_frame/__init__.py +11 -4
  26. flowfile_frame/adding_expr.py +280 -0
  27. flowfile_frame/config.py +9 -0
  28. flowfile_frame/expr.py +301 -83
  29. flowfile_frame/expr.pyi +2174 -0
  30. flowfile_frame/expr_name.py +258 -0
  31. flowfile_frame/flow_frame.py +587 -1002
  32. flowfile_frame/flow_frame.pyi +336 -0
  33. flowfile_frame/flow_frame_methods.py +617 -0
  34. flowfile_frame/group_frame.py +89 -42
  35. flowfile_frame/join.py +1 -2
  36. flowfile_frame/lazy.py +704 -0
  37. flowfile_frame/lazy_methods.py +201 -0
  38. flowfile_frame/list_name_space.py +324 -0
  39. flowfile_frame/selectors.py +3 -0
  40. flowfile_frame/series.py +70 -0
  41. flowfile_frame/utils.py +80 -4
  42. {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/LICENSE +0 -0
  43. {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/WHEEL +0 -0
  44. {flowfile-0.3.2.dist-info → flowfile-0.3.3.dist-info}/entry_points.txt +0 -0
  45. /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
  46. /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
@@ -1,11 +1,10 @@
1
-
2
- from flowfile_frame.expr import col, Expr
1
+ from flowfile_frame.expr import col, Expr, Column, lit
3
2
  from flowfile_frame.selectors import Selector
4
3
  from flowfile_frame.utils import _parse_inputs_as_iterable
5
4
  from flowfile_core.schemas import transform_schema, input_schema
6
- from typing import TYPE_CHECKING
5
+ from typing import TYPE_CHECKING, Any
6
+ from flowfile_frame.utils import _check_if_convertible_to_code, ensure_inputs_as_iterable, get_pl_expr_from_expr
7
7
 
8
- # Corrected TYPE_CHECKING block as provided by user
9
8
  if TYPE_CHECKING:
10
9
  from flowfile_frame.flow_frame import FlowFrame
11
10
  else:
@@ -16,13 +15,34 @@ class GroupByFrame:
16
15
  """Represents a grouped DataFrame for aggregation operations."""
17
16
 
18
17
  def __init__(self, node_id: int, parent_frame, by_cols, maintain_order=False, description: str = None):
18
+ """Initialize a GroupByFrame instance.
19
+
20
+ Args:
21
+ node_id: Unique identifier for this node
22
+ parent_frame: The parent FlowFrame this group operation is applied to
23
+ by_cols: Columns to group by
24
+ maintain_order: Whether to maintain original order
25
+ description: Optional description for this operation
26
+ """
19
27
  self.parent = parent_frame
20
28
  self.by_cols = _parse_inputs_as_iterable(by_cols)
29
+ self.expr_by_cols = [self._create_expr_col(c) for c in self.by_cols]
21
30
  self.maintain_order = maintain_order
22
31
  self.description = description
23
32
  self.node_id = node_id
24
33
 
34
+ @staticmethod
35
+ def _create_expr_col(col_: Any) -> Expr:
36
+ """Convert various column specifications to Expr objects."""
37
+ if isinstance(col_, str):
38
+ return col(col_)
39
+ elif isinstance(col_, (Column, Expr)):
40
+ return col_
41
+ else:
42
+ return lit(col_)
43
+
25
44
  def readable_group(self):
45
+ """Generate a readable string representation of grouping columns."""
26
46
  parts = []
27
47
  for c in self.by_cols:
28
48
  if isinstance(c, Expr):
@@ -34,24 +54,25 @@ class GroupByFrame:
34
54
  return ", ".join(parts)
35
55
 
36
56
  def len(self) -> 'FlowFrame':
37
- """
38
- Count number of rows per group. Output column is named 'len'.
39
- """
40
- # Uses direct code generation as per user's example
57
+ """Count number of rows per group. Output column is named 'len'."""
41
58
  return self._generate_direct_polars_code("len")
42
59
 
43
60
  def count(self) -> 'FlowFrame':
44
- """
45
- Count number of rows per group. Output column is named 'count'.
46
- """
47
- # Uses direct code generation as per user's example
61
+ """Count number of rows per group. Output column is named 'count'."""
48
62
  return self._generate_direct_polars_code("count")
49
63
 
50
64
  def agg(self, *agg_exprs, **named_agg_exprs) -> FlowFrame:
51
- """
52
- Apply EXPLICIT aggregations to grouped data using expressions.
65
+ """Apply explicit aggregations to grouped data using expressions.
66
+
67
+ Args:
68
+ *agg_exprs: Aggregation expressions to apply
69
+ **named_agg_exprs: Named aggregation expressions
70
+
71
+ Returns:
72
+ FlowFrame: New frame with aggregated results
53
73
  """
54
74
  agg_expressions = _parse_inputs_as_iterable(agg_exprs)
75
+ convertable_to_code = _check_if_convertible_to_code(agg_expressions)
55
76
  can_be_converted: bool = not self.maintain_order
56
77
  agg_cols: list[transform_schema.AggColl] = []
57
78
  if can_be_converted:
@@ -61,34 +82,37 @@ class GroupByFrame:
61
82
  if can_be_converted:
62
83
  can_be_converted = self._process_named_agg_expressions(agg_cols, named_agg_exprs)
63
84
  node_desc = self.description or f"Aggregate after grouping by {self.readable_group()}"
64
- return self._create_agg_node(self.node_id, can_be_converted, agg_cols, agg_expressions, named_agg_exprs, node_desc)
85
+ return self._create_agg_node(self.node_id, can_be_converted, agg_cols, agg_expressions, named_agg_exprs,
86
+ convertable_to_code=convertable_to_code, description=node_desc)
65
87
 
66
88
  def _process_group_columns(self, agg_cols: list[transform_schema.AggColl]) -> bool:
67
- # (Implementation unchanged from user input)
89
+ """Process grouping columns for aggregation schema."""
68
90
  for col_expr in self.by_cols:
69
91
  if isinstance(col_expr, str):
70
92
  agg_cols.append(transform_schema.AggColl(old_name=col_expr, agg="groupby"))
71
93
  elif isinstance(col_expr, Expr):
72
- agg_cols.append(transform_schema.AggColl(old_name=col_expr.name, agg="groupby"))
94
+ agg_cols.append(transform_schema.AggColl(old_name=col_expr.column_name, agg="groupby"))
73
95
  elif isinstance(col_expr, Selector):
74
96
  return False
75
97
  else:
76
- return False
98
+ return False
77
99
  return True
78
100
 
79
101
  @staticmethod
80
102
  def _process_agg_expressions(agg_cols: list[transform_schema.AggColl], agg_expressions) -> bool:
81
- # (Implementation unchanged from user input)
103
+ """Process aggregation expressions for schema conversion."""
82
104
  for expr in agg_expressions:
83
105
  if isinstance(expr, Expr):
106
+ if expr.is_complex:
107
+ return False
84
108
  agg_func = getattr(expr, "agg_func", None)
85
- old_name = getattr(expr, "_initial_column_name", expr.name) or expr.name
109
+ old_name = getattr(expr, "_initial_column_name", expr.column_name) or expr.column_name
86
110
  if agg_func:
87
111
  agg_cols.append(
88
- transform_schema.AggColl(old_name=old_name, agg=agg_func, new_name=expr.name)
112
+ transform_schema.AggColl(old_name=old_name, agg=agg_func, new_name=expr.column_name or old_name)
89
113
  )
90
114
  else:
91
- agg_cols.append(transform_schema.AggColl(old_name=expr.name, agg="first"))
115
+ agg_cols.append(transform_schema.AggColl(old_name=expr.column_name, agg="first"))
92
116
  elif isinstance(expr, str):
93
117
  agg_cols.append(transform_schema.AggColl(old_name=expr, agg="first"))
94
118
  elif isinstance(expr, Selector):
@@ -99,19 +123,20 @@ class GroupByFrame:
99
123
 
100
124
  @staticmethod
101
125
  def _process_named_agg_expressions(agg_cols: list[transform_schema.AggColl], named_agg_exprs: dict) -> bool:
126
+ """Process named aggregation expressions for schema conversion."""
102
127
  for name, expr in named_agg_exprs.items():
103
128
  if expr.is_complex:
104
129
  return False
105
130
  if isinstance(expr, Expr):
106
131
  agg_func = getattr(expr, "agg_func", "first")
107
- old_name = getattr(expr, "_initial_column_name", expr.name) or expr.name
132
+ old_name = getattr(expr, "_initial_column_name", expr.column_name) or expr.column_name
108
133
  agg_cols.append(transform_schema.AggColl(old_name=old_name, agg=agg_func, new_name=name))
109
134
  elif isinstance(expr, str):
110
135
  agg_cols.append(transform_schema.AggColl(old_name=expr, agg="first", new_name=name))
111
136
  elif isinstance(expr, tuple) and len(expr) == 2:
112
137
  col_spec, agg_func_str = expr
113
138
  if isinstance(col_spec, Expr):
114
- old_name = getattr(col_spec, "_initial_column_name", col_spec.name) or col_spec.name
139
+ old_name = getattr(col_spec, "_initial_column_name", col_spec.column_name) or col_spec.column_name
115
140
  elif isinstance(col_spec, str):
116
141
  old_name = col_spec
117
142
  else:
@@ -123,9 +148,9 @@ class GroupByFrame:
123
148
  return False
124
149
  return True
125
150
 
126
- def _create_agg_node(self, node_id_to_use: int, can_be_converted: bool, agg_cols: list, agg_expressions, named_agg_exprs, description: str):
127
- """Creates node for explicit aggregations via self.agg()"""
128
- # (Implementation unchanged from user input, passes description)
151
+ def _create_agg_node(self, node_id_to_use: int, can_be_converted: bool, agg_cols: list, agg_expressions,
152
+ named_agg_exprs, convertable_to_code: bool, description: str):
153
+ """Create node for explicit aggregations via self.agg()."""
129
154
  if can_be_converted:
130
155
  group_by_settings = input_schema.NodeGroupBy(
131
156
  flow_id=self.parent.flow_graph.flow_id,
@@ -138,34 +163,44 @@ class GroupByFrame:
138
163
  self.parent.flow_graph.add_group_by(group_by_settings)
139
164
  else:
140
165
  code = self._generate_polars_agg_code(agg_expressions, named_agg_exprs)
141
- self.parent._add_polars_code(new_node_id=node_id_to_use, code=code, description=description)
166
+ pl_agg_expressions = list(map(get_pl_expr_from_expr, ensure_inputs_as_iterable(agg_expressions)))
167
+ pl_group_expr = list(map(get_pl_expr_from_expr, ensure_inputs_as_iterable(self.expr_by_cols)))
168
+ pl_kwargs_expr = {k: self._create_expr_col(c).expr for k, c in named_agg_exprs.items()}
169
+ self.parent._add_polars_code(new_node_id=node_id_to_use, code=code, description=description,
170
+ method_name='group_by', convertable_to_code=convertable_to_code,
171
+ polars_expr=pl_agg_expressions, group_expr=pl_group_expr,
172
+ kwargs_expr=pl_kwargs_expr,
173
+ group_kwargs={'maintain_order': self.maintain_order})
142
174
  return self.parent._create_child_frame(node_id_to_use)
143
175
 
144
- def _generate_direct_polars_code(self, method_name: str) -> "FlowFrame":
145
- """
146
- Generates Polars code for simple GroupBy methods like sum(), mean(), len(), count()
147
- which operate implicitly or have a standard Polars counterpart.
148
- Always uses the Polars code path.
176
+ def _generate_direct_polars_code(self, method_name: str, *args, **kwargs) -> "FlowFrame":
177
+ """Generate Polars code for simple GroupBy methods like sum(), mean(), len(), count().
178
+
179
+ Args:
180
+ method_name: Name of the aggregation method
181
+ *args: Positional arguments for the method
182
+ **kwargs: Keyword arguments for the method
183
+
184
+ Returns:
185
+ FlowFrame: New child frame with the operation applied
149
186
  """
150
187
  readable_group_str = self.readable_group()
151
- code = f"input_df.group_by([{readable_group_str}], maintain_order={self.maintain_order}).{method_name}()"
188
+ execution = "(" + ",".join(args) + ",".join([f"{k}={v}" for k, v in kwargs.items()]) + ")"
189
+
190
+ code = f"input_df.group_by([{readable_group_str}], maintain_order={self.maintain_order}).{method_name}{execution}"
152
191
  node_description = self.description or f"{method_name.capitalize()} after grouping by {readable_group_str}"
153
192
  self.parent._add_polars_code(new_node_id=self.node_id, code=code, description=node_description)
154
193
  return self.parent._create_child_frame(self.node_id)
155
194
 
156
195
  def _generate_polars_agg_code(self, agg_expressions, named_agg_exprs) -> str:
157
196
  """Generate Polars code specifically for explicit .agg() calls."""
158
- # (Implementation unchanged from user input)
159
197
  readable_group_str = self.readable_group()
160
198
  agg_strs = [str(expr) for expr in agg_expressions]
161
199
  named_agg_strs = [f"{name}={str(expr)}" for name, expr in named_agg_exprs.items()]
162
200
  all_agg_strs = agg_strs + named_agg_strs
163
201
  agg_combined = ", ".join(all_agg_strs)
164
- # Assuming input dataframe is 'input_df' in execution context
165
202
  return f"input_df.group_by([{readable_group_str}], maintain_order={self.maintain_order}).agg({agg_combined})"
166
203
 
167
- # --- Convenience Methods (No Column Args - Use Direct Code Gen) ---
168
-
169
204
  def sum(self):
170
205
  """Calculate sum for all non-grouping columns."""
171
206
  return self._generate_direct_polars_code("sum")
@@ -180,20 +215,32 @@ class GroupByFrame:
180
215
 
181
216
  def min(self):
182
217
  """Calculate minimum for all non-grouping columns."""
183
- # Remove *columns argument
184
218
  return self._generate_direct_polars_code("min")
185
219
 
186
220
  def max(self):
187
221
  """Calculate maximum for all non-grouping columns."""
188
- # Remove *columns argument
189
222
  return self._generate_direct_polars_code("max")
190
223
 
224
+ def tail(self, n: int = 10):
225
+ """Get last n rows for all non-grouping columns.
226
+
227
+ Args:
228
+ n: Number of rows to return (default: 10)
229
+ """
230
+ return self._generate_direct_polars_code("tail", n=n)
231
+
232
+ def head(self, n: int = 10):
233
+ """Get the first n rows for all non-grouping columns.
234
+
235
+ Args:
236
+ n: Number of rows to return (default: 10)
237
+ """
238
+ return self._generate_direct_polars_code("head", n=n)
239
+
191
240
  def first(self):
192
241
  """Get first value for all non-grouping columns."""
193
- # Remove *columns argument
194
242
  return self._generate_direct_polars_code("first")
195
243
 
196
244
  def last(self):
197
245
  """Get last value for all non-grouping columns."""
198
- # Remove *columns argument
199
246
  return self._generate_direct_polars_code("last")
flowfile_frame/join.py CHANGED
@@ -39,11 +39,10 @@ def _extract_column_name(col_expr):
39
39
  if isinstance(col_expr, Column):
40
40
  # If it's a simple unaltered column, use its name
41
41
  if not col_expr._select_input.is_altered:
42
- return col_expr.name, False
42
+ return col_expr.column_name, False
43
43
  # Otherwise, this requires polars code
44
44
  return col_expr, True
45
45
 
46
- # Any other expression type needs polars code
47
46
  return col_expr, True
48
47
 
49
48